diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 8e2574974a82d..d969178f74179 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1473,8 +1473,9 @@ class MachineInstr /// Return true is the instruction is an identity copy. bool isIdentityCopy() const { - return isCopy() && getOperand(0).getReg() == getOperand(1).getReg() && - getOperand(0).getSubReg() == getOperand(1).getSubReg(); + return (isCopy() || isCopyLaneMask()) && + getOperand(0).getReg() == getOperand(1).getReg() && + getOperand(0).getSubReg() == getOperand(1).getSubReg(); } /// Return true if this is a transient instruction that is either very likely diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index c4ba4195f307f..7a291844c8670 100644 --- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -168,6 +168,7 @@ bool ExpandPostRA::run(MachineFunction &MF) { MadeChange |= LowerSubregToReg(&MI); break; case TargetOpcode::COPY: + case TargetOpcode::COPY_LANEMASK: TII->lowerCopy(&MI, TRI); MadeChange = true; break; diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index fef3a3663d3a8..b8b83e626fe86 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -884,12 +884,21 @@ void TargetInstrInfo::lowerCopy( if (IdentityCopy || SrcMO.isUndef()) { // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. - if (SrcMO.isUndef() || MI->getNumOperands() > 2) { + if (MI->getOpcode() == TargetOpcode::COPY && + (SrcMO.isUndef() || MI->getNumOperands() > 2)) { // We must make sure the super-register gets killed. Replace the // instruction with KILL. 
MI->setDesc(get(TargetOpcode::KILL)); return; } + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK && + (SrcMO.isUndef() || MI->getNumOperands() > 3)) { + // We must make sure the super-register gets killed. Replace the + // instruction with KILL. + MI->setDesc(get(TargetOpcode::KILL)); + return; + } + // Vanilla identity copy. MI->eraseFromParent(); return; @@ -900,7 +909,10 @@ void TargetInstrInfo::lowerCopy( DstMO.getReg().isPhysical() ? DstMO.isRenamable() : false, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); - if (MI->getNumOperands() > 2) + if (MI->getOpcode() == TargetOpcode::COPY && MI->getNumOperands() > 2) + transferImplicitOperands(MI, &TRI); + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK && + MI->getNumOperands() > 3) transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 972bd8f550e8b..cab3fca289a32 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -213,6 +213,8 @@ class VirtRegRewriter { void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; + LaneBitmask calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const; void addLiveInsForSubRanges(const LiveInterval &LI, MCRegister PhysReg) const; void handleIdentityCopy(MachineInstr &MI); void expandCopyBundle(MachineInstr &MI) const; @@ -474,6 +476,80 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { return true; } +// Return the lane mask of the register units of PhysReg (assigned to MO) that +// are live at MO's parent MI. Return LaneBitmask::getNone() if MO is undef, or +// if all or none of its lanes are live.
+LaneBitmask VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const { + Register Reg = MO.getReg(); + const LiveInterval &LI = LIS->getInterval(Reg); + const MachineInstr &MI = *MO.getParent(); + SlotIndex MIIndex = LIS->getInstructionIndex(MI); + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = SubRegIdx + ? TRI->getSubRegIndexLaneMask(SubRegIdx) + : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) + : LaneBitmask::getNone()); + + LaneBitmask LiveRegUnitMask; + DenseSet LiveRegUnits; + + // dbgs() << "\n********** " << printReg(Reg, TRI) << "[ " << + // printReg(PhysReg, TRI) << " ]" << " **********\n"; + + if (MO.isUndef()) + return LaneBitmask::getNone(); + + assert(LI.liveAt(MIIndex) && + "Reads of completely dead register should be marked undef already"); + + if (LI.hasSubRanges()) { + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + MCRegUnit Unit = (*Units).first; + LaneBitmask Mask = (*Units).second; + for (const LiveInterval::SubRange &S : LI.subranges()) { + if ((S.LaneMask & UseMask & Mask).any() && S.liveAt(MIIndex)) { + LiveRegUnits.insert(Unit); + } + } + } + } else { + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + MCRegUnit Unit = (*Units).first; + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + LaneBitmask Mask = (*Units).second; + + if (UnitRange.liveAt(MIIndex) && (UseMask & Mask).any()) + LiveRegUnits.insert(Unit); + } + } + + // Consider the exact subregister & create new UseMask as per the RC for it. 
+ if (SubRegIdx != 0) { + PhysReg = TRI->getSubReg(PhysReg, SubRegIdx); + UseMask = (TRI->getMinimalPhysRegClass(PhysReg))->getLaneMask(); + } + + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + MCRegUnit Unit = (*Units).first; + LaneBitmask Mask = (*Units).second; + if (LiveRegUnits.count(Unit)) { + // dbgs() << "LIVE DEF UNIT : " << printRegUnit(Unit, TRI) << '\n'; + LiveRegUnitMask |= Mask; + } + } + + // dbgs() << "UseMask : " << PrintLaneMask(UseMask) << '\n'; + // dbgs() << "LiveRegUnitMask : " << PrintLaneMask(LiveRegUnitMask) << '\n'; + // If all lanes are live or dead, no need to create a COPY_LANEMASK + // instruction. + if (LiveRegUnitMask.all() || LiveRegUnitMask.none() || + LiveRegUnitMask == UseMask) + return LaneBitmask::getNone(); + + return LiveRegUnitMask; +} + void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { if (!MI.isIdentityCopy()) return; @@ -495,7 +571,14 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { // give us additional liveness information: The target (super-)register // must not be valid before this point. Replace the COPY with a KILL // instruction to maintain this information. 
- if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) { + if (MI.getOpcode() == TargetOpcode::COPY && + (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2)) { + MI.setDesc(TII->get(TargetOpcode::KILL)); + LLVM_DEBUG(dbgs() << " replace by: " << MI); + return; + } + if (MI.getOpcode() == TargetOpcode::COPY_LANEMASK && + (MI.getOperand(1).isUndef() || MI.getNumOperands() > 3)) { MI.setDesc(TII->get(TargetOpcode::KILL)); LLVM_DEBUG(dbgs() << " replace by: " << MI); return; @@ -641,11 +724,14 @@ void VirtRegRewriter::rewrite() { SmallVector SuperDeads; SmallVector SuperDefs; SmallVector SuperKills; + LaneBitmask LaneMask; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { LLVM_DEBUG(MBBI->print(dbgs(), Indexes)); for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) { + // Reset for each MI. + LaneMask = LaneBitmask::getNone(); for (MachineOperand &MO : MI.operands()) { // Make sure MRI knows about registers clobbered by regmasks. if (MO.isRegMask()) @@ -663,6 +749,9 @@ void VirtRegRewriter::rewrite() { RewriteRegs.insert(PhysReg); assert(!MRI->isReserved(PhysReg) && "Reserved register assignment"); + if (MO.isUse() && MI.isCopy()) + LaneMask = calcLiveRegUnitMask(MO, PhysReg); + // Preserve semantics of sub-register operands. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { @@ -739,6 +828,13 @@ void VirtRegRewriter::rewrite() { MO.setIsRenamable(true); } + // If the copy source is only partially live, replace the COPY with a + // COPY_LANEMASK instruction carrying the mask of its live lanes. + if (MI.isCopy() && LaneMask.any()) { + MI.setDesc(TII->get(TargetOpcode::COPY_LANEMASK)); + MI.addOperand(*MF, MachineOperand::CreateLaneMask(LaneMask)); + } + // Add any missing super-register kills after rewriting the whole // instruction.
while (!SuperKills.empty()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6d2110957002a..d317cadf15be2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -694,16 +694,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, I->clearRegisterKills(DefOp.getReg(), &RI); } - MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) - .add(DefOp); - if (ImpDefSuperReg) - Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); - - if (ImpUseSuperReg) { - Builder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + .add(DefOp); return; } @@ -747,27 +739,26 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) .addReg(SrcReg, getKillRegState(KillSrc)); - if (ImpUseSuperReg) { - UseBuilder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } - MachineInstrBuilder DefBuilder - = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) - .addReg(Tmp, RegState::Kill); - - if (ImpDefSuperReg) - DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + .addReg(Tmp, RegState::Kill); } static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, - const TargetRegisterClass *RC, bool Forward) { + const TargetRegisterClass *RC, bool Forward, + uint64_t LiveRegUnitMaskVal) { const SIRegisterInfo &RI = TII.getRegisterInfo(); ArrayRef BaseIndices = RI.getRegSplitParts(RC, 4); MachineBasicBlock::iterator I = MI; - MachineInstr *FirstMI = nullptr, *LastMI = nullptr; + bool isSrcRegFullLive = 
LiveRegUnitMaskVal == 0; + + uint64_t TestMaskVal = 0x0000000000000003; + uint8_t ShiftVal = 2; + + if (!Forward) + TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size() - 1)); for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; @@ -775,41 +766,47 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); unsigned Opcode = AMDGPU::S_MOV_B32; + bool IsFirstSubreg = Idx == 0; + + if (!IsFirstSubreg) { + TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; + } + + // Check for liveness of current subregister using TestMaskVal. + if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0)) + continue; // Is SGPR aligned? If so try to combine with next. bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0; bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0; - if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { + bool isSrc64Live = true; + + if (!isSrcRegFullLive) + isSrc64Live = Forward + ? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) != + uint64_t(0)) + : ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) != + uint64_t(0)); + + if (isSrc64Live && AlignedDest && AlignedSrc && + (Idx + 1 < BaseIndices.size())) { // Can use SGPR64 copy unsigned Channel = RI.getChannelFromSubReg(SubIdx); SubIdx = RI.getSubRegFromChannel(Channel, 2); DestSubReg = RI.getSubReg(DestReg, SubIdx); SrcSubReg = RI.getSubReg(SrcReg, SubIdx); assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); + TestMaskVal = Forward ? 
TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; Opcode = AMDGPU::S_MOV_B64; Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) - .addReg(SrcSubReg) - .addReg(SrcReg, RegState::Implicit); - - if (!FirstMI) - FirstMI = LastMI; + BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) + .addReg(SrcSubReg, getKillRegState(KillSrc)); if (!Forward) I--; } - - assert(FirstMI && LastMI); - if (!Forward) - std::swap(FirstMI, LastMI); - - FirstMI->addOperand( - MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); - - if (KillSrc) - LastMI->addRegisterKilled(SrcReg, &RI); } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -822,6 +819,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); + uint64_t LiveRegUnitMaskVal = 0; + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK) { + LiveRegUnitMaskVal = MI->getOperand(2).getLaneMask().getAsInteger(); + } + + bool isSrcRegFullLive = LiveRegUnitMaskVal == 0; + // The rest of copyPhysReg assumes Src and Dst size are the same size. // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can // we remove Fix16BitCopies and this code block? 
@@ -1043,16 +1047,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (ST.hasPkMovB32()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) - .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcReg) - .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp return; } } @@ -1065,12 +1068,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, - Forward); + Forward, LiveRegUnitMaskVal); return; } unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; + uint64_t TestMaskVal = 0x0000000000000003; + uint8_t ShiftVal = 2; if (RI.isAGPRClass(RC)) { if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) Opcode = AMDGPU::V_ACCVGPR_MOV_B32; @@ -1085,12 +1090,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, (RI.isProperlyAlignedRC(*RC) && (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. + // TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit + // moves. See the expandSGPRCopy function for reference.
if (ST.hasMovB64()) { Opcode = AMDGPU::V_MOV_B64_e32; EltSize = 8; + TestMaskVal = 0x000000000000000F; + ShiftVal = 4; } else if (ST.hasPkMovB32()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; + TestMaskVal = 0x000000000000000F; + ShiftVal = 4; } } @@ -1105,6 +1116,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, ArrayRef SubIndices = RI.getRegSplitParts(RC, EltSize); + // The TestMaskVal will scan from right to left. + if (!Forward) + TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size() - 1)); + // If there is an overlap, we can't kill the super-register on the last // instruction, since it will also kill the components made live by this def. const bool Overlap = RI.regsOverlap(SrcReg, DestReg); @@ -1121,7 +1136,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); bool IsFirstSubreg = Idx == 0; - bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; + bool UseKill = CanKillSuperReg; + + if (!IsFirstSubreg) { + TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; + } + + if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0)) + continue; if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); @@ -1132,24 +1154,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)) .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); - if (IsFirstSubreg) - MIB.addReg(DestReg, RegState::Define | RegState::Implicit); + .addReg(SrcSubReg, getKillRegState(UseKill)) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp } else { MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); - if (IsFirstSubreg) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - - Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(Opcode), DestSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)); } } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a3bacfbfe5214..89641cf21c55d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -262,7 +262,9 @@ static bool isConvertibleToVMV_V_V(const RISCVSubtarget &STI, if (PreferWholeRegisterMove) return false; - assert(MBBI->getOpcode() == TargetOpcode::COPY && + // TODO : Support COPY_LANEMASK instruction. 
+ assert((MBBI->getOpcode() == TargetOpcode::COPY || + MBBI->getOpcode() == TargetOpcode::COPY_LANEMASK) && "Unexpected COPY instruction."); Register SrcReg = MBBI->getOperand(1).getReg(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll index d6f1b142b36e0..6060e9366cad0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll @@ -438,9 +438,9 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s16, s18 ; GFX7-NEXT: s_addc_u32 s5, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -455,8 +455,8 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s4, s16, s18 ; GFX9-NEXT: s_addc_u32 s5, s17, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -469,8 +469,8 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s4, s16, s18 ; GFX8-NEXT: s_addc_u32 s5, s17, s19 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll index bbee88050edb9..54f9ead913b02 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -676,8 +676,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_saddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s4, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_addc_u32 s5, s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -693,8 +693,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_saddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -710,8 +710,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 65bc2d73b36b6..fb93eff1bec45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -96,8 +96,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -192,8 +192,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -294,8 +294,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -392,8 +392,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -492,8 +492,8 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -598,8 +598,8 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; 
GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 8063b29c29985..841de28cd4f82 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -8,6 +8,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + ; TODO: Delete this and add run lines to use *-atomicrmw-fmax.ll tests define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { @@ -1823,10 +1824,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1865,10 +1864,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; 
GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1989,10 +1986,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,10 +2026,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,9 +2076,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc @@ -2108,9 +2105,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 5b0b602bd99ba..685eaabeb20b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -8,6 +8,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + ; TODO: Delete this and add run lines to use *-atomicrmw-fmin.ll tests define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { @@ -1823,10 +1824,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: 
v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1865,10 +1864,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1989,10 +1986,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,10 +2026,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,9 +2076,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc @@ -2108,9 +2105,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index dac726df5decb..9fabaab9bca44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -25,9 +25,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -43,9 +43,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -108,9 +108,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -126,9 +126,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1109,9 +1109,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-LABEL: 
flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1375,9 +1375,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1461,9 +1461,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1579,10 +1579,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1689,11 +1689,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1719,16 +1719,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 
{ ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -1743,16 +1743,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1829,15 +1829,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: 
v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -1855,15 +1855,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1940,9 +1940,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1956,9 +1956,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2003,9 +2003,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2027,8 +2027,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2045,8 +2045,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2080,8 +2080,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2093,9 +2093,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ 
-2118,8 +2118,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2136,8 +2136,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2171,8 +2171,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,9 +2184,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2218,8 +2218,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 @@ -2247,8 +2247,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 @@ -2265,16 +2265,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2301,8 +2301,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, 
vcc_lo ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2313,10 +2313,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2324,7 +2325,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo @@ -2390,8 +2391,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc @@ -2429,10 +2430,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; 
GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2460,10 +2462,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 -; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2481,10 +2483,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2559,9 +2561,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: ds_dec_rtn_u64 v[0:1], 
v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2578,9 +2580,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2647,9 +2649,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2666,9 +2668,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2856,9 +2858,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 
; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2875,9 +2877,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2942,8 +2944,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2963,8 +2965,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3030,8 +3032,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3051,8 +3053,8 @@ define amdgpu_kernel void 
@global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3115,9 +3117,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3131,9 +3133,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3191,8 +3193,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3209,8 +3211,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3269,8 +3271,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3289,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3356,8 +3358,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3382,8 +3384,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3539,10 +3541,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; 
CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 ; CI-NEXT: v_mov_b32_e32 v4, s1 @@ -3561,10 +3563,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 7f3e24f97b6e2..1b93d7c265904 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -26,9 +26,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -44,9 +44,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -121,9 +121,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -139,9 +139,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1082,10 +1082,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 -; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1103,10 +1103,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, 
s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1198,9 +1198,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1217,9 +1217,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1299,9 +1299,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1318,9 +1318,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1543,9 +1543,9 @@ define amdgpu_kernel 
void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1562,9 +1562,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1641,8 +1641,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1662,8 +1662,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1741,8 +1741,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: 
v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1762,8 +1762,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1840,9 +1840,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1856,9 +1856,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1927,8 +1927,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1945,8 +1945,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2016,8 +2016,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2034,8 +2034,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2116,8 +2116,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2142,8 +2142,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2502,9 
+2502,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2516,9 +2516,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2614,9 +2614,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2628,9 +2628,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; 
GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -2807,9 +2807,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2820,9 +2820,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2904,9 +2904,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 
s0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2917,9 +2917,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS @@ -3035,10 +3035,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3057,10 +3057,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3167,11 +3167,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3186,11 +3186,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3218,10 +3218,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 ; CI-NEXT: v_mov_b32_e32 v4, s1 @@ -3240,10 +3240,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -3336,16 +3336,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3360,16 +3360,16 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3460,15 +3460,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3486,15 +3486,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: 
v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3588,15 +3588,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3614,15 +3614,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3715,9 +3715,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3731,9 +3731,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; 
VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3778,9 +3778,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3792,9 +3792,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3814,8 +3814,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3832,8 +3832,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 
32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3867,8 +3867,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3880,9 +3880,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3894,9 +3894,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3917,8 +3917,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: 
s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3935,8 +3935,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3970,8 +3970,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3983,9 +3983,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3997,9 +3997,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS @@ -4031,8 +4031,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 @@ -4060,8 +4060,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 @@ -4078,16 +4078,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ 
-4114,8 +4114,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4126,10 +4126,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4137,7 +4138,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo @@ -4149,17 +4150,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; 
GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4226,8 +4228,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc @@ -4265,10 +4267,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: 
v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4284,10 +4287,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4316,9 +4320,9 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -4339,9 +4343,9 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll index 62a5313dc8d3c..af29a2f7ba6ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll @@ -8,11 +8,10 @@ define void @main(<19 x i32> %arg) { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s12, s4 ; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s12, s4 ; GCN-NEXT: s_mov_b32 s13, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_mov_b32 s6, s4 ; GCN-NEXT: s_mov_b32 s7, s4 @@ -23,6 +22,7 @@ define void @main(<19 x i32> %arg) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -51,7 +51,7 @@ define void @main(<19 x i32> %arg) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 94b956ef254a5..e437877956a93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -476,10 +476,10 @@ define 
amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -696,6 +696,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -706,7 +707,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 @@ -884,9 +884,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: flat_load_ubyte v4, v[8:9] ; VI-NEXT: flat_load_ubyte v5, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[12:13] -; VI-NEXT: v_mov_b32_e32 v8, s1 ; VI-NEXT: v_mov_b32_e32 v7, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v8, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v10, s1 ; VI-NEXT: v_mov_b32_e32 v9, s0 @@ -957,9 +957,9 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 
v[6:7], v[0:1] -; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s1 ; VI-NEXT: v_mov_b32_e32 v10, s0 @@ -1013,11 +1013,11 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1059,10 +1059,10 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1104,10 +1104,10 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1174,6 +1174,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; 
VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1184,7 +1185,6 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 @@ -1229,10 +1229,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1273,10 +1273,10 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1318,10 +1318,10 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1362,10 +1362,10 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 121dd309fddf9..cb2edcfba4ee5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -557,14 +557,14 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s7, exec_lo, s8 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: 
s_waitcnt vmcnt(0) @@ -573,8 +573,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_cbranch_execz .LBB7_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index 5c57d355959ef..fda8d4187d42a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -123,8 +123,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, v4 @@ -136,8 +136,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_cbranch_execz .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -145,8 +145,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s5, exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s5, s3, s0 ; GFX10-NEXT: global_load_dword v6, 
v[4:5], off +; GFX10-NEXT: s_or_b32 s5, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6 @@ -203,8 +203,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 @@ -216,8 +216,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo @@ -228,8 +228,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -237,8 +237,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s6, s3, s0 ; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_or_b32 s6, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 @@ -307,8 +307,8 @@ 
define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 @@ -320,8 +320,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v9, vcc_lo @@ -332,8 +332,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_2 ; GFX10-NEXT: ; %bb.6: ; %C ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v6, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v7, v9, vcc_lo @@ -344,8 +344,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_1 ; GFX10-NEXT: ; %bb.7: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -353,8 +353,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s8, 
exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s8, s3, s0 ; GFX10-NEXT: global_load_dword v10, v[8:9], off +; GFX10-NEXT: s_or_b32 s8, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v10 @@ -427,14 +427,14 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s7, exec_lo, s8 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index a8b27ecd7e9fc..eab7a43c32c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -414,8 +414,8 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: ; %bb.3: ; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB5_1 
Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_mov_b32_e32 v8, s6 +; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 ; GFX10-NEXT: s_add_i32 s6, s6, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 9dfd0a47d1e1e..bc47a8bc1bec7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -41,10 +41,10 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre ; GFX7-NEXT: s_and_b32 s2, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s2, 4 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -89,10 +89,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -106,10 +106,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 
s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 798f6eb65e6aa..f87a0385616d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -35,10 +35,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg ; GFX7-NEXT: s_and_b32 s2, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s2, 1 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -74,10 +74,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ushort v0, v[0:1], off @@ -88,10 +88,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; 
GFX8-NEXT: flat_load_ushort v0, v[0:1] @@ -679,10 +679,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg ; GFX7-NEXT: s_and_b32 s2, s4, 7 ; GFX7-NEXT: s_lshl_b32 s4, s2, 1 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -718,10 +718,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 7 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ushort v0, v[0:1], off @@ -732,10 +732,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 7 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index de1079196223a..44ed74fd072b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -35,11 +35,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) 
inreg %p ; GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -79,8 +79,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -92,8 +92,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -116,8 +116,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -130,7 +130,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX11-NEXT: s_and_b32 s0, s2, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -262,8 +262,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p ; ; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 3, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -688,11 +688,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p ; GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 7 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -732,8 +732,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 7 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -745,8 +745,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 7 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: 
v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -769,8 +769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 7 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -783,7 +783,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX11-NEXT: s_and_b32 s0, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -915,8 +915,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p ; ; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 7, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 7, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -1725,11 +1725,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 15 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1769,8 +1769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -1782,8 +1782,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 15 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -1806,8 +1806,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -1820,7 +1820,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX11-NEXT: s_and_b32 s0, s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: 
v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -1952,8 +1952,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg % ; ; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 15, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 206011adf0213..1f1603de6ed26 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3183,10 +3183,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: v_mov_b32_e32 v0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm @@ -4189,8 +4189,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b32 s3, 0x40400000, s3 ; MOVREL-NEXT: s_cmp_eq_u32 s2, 3 ; MOVREL-NEXT: s_cselect_b32 s2, 4.0, s3 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 
; MOVREL-NEXT: flat_store_dword v[0:1], v2 ; MOVREL-NEXT: s_endpgm @@ -4541,10 +4541,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; MOVREL-NEXT: v_mov_b32_e32 v0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll index 39a793ce67bb9..79610c43fdbca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll @@ -254,7 +254,7 @@ define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -263,7 +263,7 @@ define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX12-NEXT: s_bitset0_b32 s0, 31 ; GFX12-NEXT: s_bitset0_b32 s1, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll index 3d224f2f6bf05..9429576474825 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=MESA %s ; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=PAL %s + ; Test that the initialization for flat_scratch doesn't crash. PAL ; doesn't add a user SGPR for initializing flat_scratch, mesa does ; (although this probably isn't actually defined). @@ -10,11 +11,11 @@ define amdgpu_ps void @amdgpu_ps() { ; MESA-LABEL: amdgpu_ps: ; MESA: ; %bb.0: ; MESA-NEXT: s_mov_b64 s[0:1], src_private_base -; MESA-NEXT: s_mov_b32 s0, 0 ; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 -; MESA-NEXT: v_mov_b32_e32 v0, s0 +; MESA-NEXT: s_mov_b32 s0, 0 ; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v0, s0 ; MESA-NEXT: v_mov_b32_e32 v1, s1 ; MESA-NEXT: flat_store_dword v[0:1], v2 ; MESA-NEXT: s_waitcnt vmcnt(0) @@ -31,8 +32,8 @@ define amdgpu_ps void @amdgpu_ps() { ; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; PAL-NEXT: s_mov_b64 s[0:1], src_private_base ; PAL-NEXT: s_mov_b32 s0, 0 -; PAL-NEXT: v_mov_b32_e32 v0, s0 ; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; PAL-NEXT: v_mov_b32_e32 v0, s0 ; PAL-NEXT: v_mov_b32_e32 v1, s1 ; PAL-NEXT: flat_store_dword v[0:1], v2 ; PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll index ebc28cb005538..7e1c405c0d6d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll @@ -254,7 +254,7 @@ define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -263,7 +263,7 @@ define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 ; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %fneg = fneg <2 x float> %in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 91ee7642790fc..5a527c61df424 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -2,11 +2,12 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s - ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s + + define amdgpu_kernel void @addrspacecast(ptr 
addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: @@ -15,16 +16,16 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8V4-NEXT: s_add_u32 s2, s6, 0x44 ; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8V4-NEXT: s_and_b32 s4, 1, s2 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_add_u32 s2, s6, 64 -; GFX8V4-NEXT: flat_load_dword v3, v[0:1] +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 +; GFX8V4-NEXT: flat_load_dword v3, v[0:1] ; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: flat_load_dword v4, v[0:1] @@ -59,17 +60,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX8V5-NEXT: s_mov_b32 s2, s1 +; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) -; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -85,17 +86,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: 
s_mov_b32 s0, s4 ; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V4-NEXT: s_mov_b32 s2, s5 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -111,17 +112,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s0, s4 ; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V5-NEXT: s_mov_b32 s2, s5 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) @@ -295,11 +296,11 @@ define amdgpu_kernel void @llvm_debugtrap() #0 { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V4-NEXT: s_add_u32 s0, s8, 8 -; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 
+; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 @@ -321,11 +322,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V5-NEXT: s_add_u32 s0, s8, 8 -; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 0e1bbbd1ea92b..8b9dca591d5fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -1128,9 +1128,9 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1150,11 +1150,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-NEXT: 
v_lshlrev_b32_e64 v3, v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1263,8 +1263,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 @@ -1288,8 +1288,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1768,8 +1768,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -1801,14 +1801,14 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 
s0, s4, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -1836,15 +1836,16 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s0, s4, s0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_cselect_b32 s1, s4, s1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_cselect_b32 s2, s4, s2 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_cselect_b32 s3, s4, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2315,13 +2316,13 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX10-NEXT: 
v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2338,27 +2339,29 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX11-NEXT: s_and_b32 s1, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_not_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 @@ -2497,13 +2500,13 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v5, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2530,14 +2533,14 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX11-NEXT: v_not_b32_e32 v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: 
v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3053,17 +3056,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: s_cselect_b32 s5, s16, s13 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -3083,17 +3086,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_movreld_b32 s8, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_movreld_b32 s8, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 @@ -3114,8 +3117,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_movreld_b32 s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; 
GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 @@ -3725,30 +3728,30 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_not_b32_e32 v9, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -3778,9 +3781,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_not_b32_e32 v9, v2 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_not_b32_e32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -3788,20 +3795,18 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 
v13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -3973,11 +3978,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -4014,30 +4019,30 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_not_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -4062,37 +4067,39 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s9 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 -; GFX11-NEXT: v_not_b32_e32 v9, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_not_b32_e32 v9, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: 
v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_mov_b32_e32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 ; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, s12 -; GFX11-NEXT: v_mov_b32_e32 v6, s14 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 4598bcc04a505..f50670376b0c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1580,8 +1580,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_cselect_b32 s2, s4, s0 ; GFX7-NEXT: s_cmp_eq_u32 s3, 1 ; GFX7-NEXT: s_cselect_b32 s3, s4, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1607,9 +1607,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 ; GFX10-NEXT: s_cselect_b32 s0, s3, s0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, 
s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; @@ -1632,10 +1632,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_cselect_b32 s1, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -1991,9 +1992,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2013,11 +2014,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 3, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2126,8 +2127,8 
@@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 @@ -2151,8 +2152,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2631,8 +2632,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -2664,14 +2665,14 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; 
GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -2699,15 +2700,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s0, s4, s0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_cselect_b32 s1, s4, s1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_cselect_b32 s2, s4, s2 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_cselect_b32 s3, s4, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -3178,13 +3180,13 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -3201,27 +3203,29 @@ define amdgpu_ps void 
@insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 3, v0 ; GFX11-NEXT: s_and_b32 s1, s4, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_not_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 @@ -3360,13 +3364,13 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v5, 
v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -3393,14 +3397,14 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX11-NEXT: v_not_b32_e32 v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 533b25ef1a0c0..c53364de8df88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -274,6 +274,8 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX10-LABEL: dyn_insertelement_v8f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -281,8 +283,6 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s10 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -298,18 +298,18 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v8, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s10 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_movreld_b32_e32 v0, v8 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -686,23 +686,27 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; 
GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b32 s18, 0 ; GPRIDX-NEXT: s_mov_b32 s16, 0 +; GPRIDX-NEXT: s_mov_b32 s18, 0 +; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s14, 0 ; GPRIDX-NEXT: s_mov_b32 s12, 0 ; GPRIDX-NEXT: s_mov_b32 s8, 0 +; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 -; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 ; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 ; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 -; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 @@ -711,12 +715,8 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 @@ -753,58 +753,58 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b64 s[4:5], 1.0 -; GFX10-NEXT: s_mov_b32 s18, 0 -; 
GFX10-NEXT: s_mov_b32 s16, 0 -; GFX10-NEXT: s_mov_b32 s14, 0 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: s_mov_b32 s19, 0x40200000 -; GFX10-NEXT: s_mov_b32 s17, 0x401c0000 -; GFX10-NEXT: s_mov_b32 s15, 0x40180000 -; GFX10-NEXT: s_mov_b32 s13, 0x40140000 -; GFX10-NEXT: s_mov_b64 s[10:11], 4.0 -; GFX10-NEXT: s_mov_b32 s9, 0x40080000 -; GFX10-NEXT: s_mov_b64 s[6:7], 2.0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 2.0 +; GFX10-NEXT: s_mov_b32 s9, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v5, s6 ; GFX10-NEXT: v_mov_b32_e32 v6, s7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: v_mov_b32_e32 v9, s10 -; GFX10-NEXT: v_mov_b32_e32 v10, s11 -; GFX10-NEXT: v_mov_b32_e32 v11, s12 -; GFX10-NEXT: v_mov_b32_e32 v12, s13 -; GFX10-NEXT: v_mov_b32_e32 v13, s14 -; GFX10-NEXT: v_mov_b32_e32 v14, s15 -; GFX10-NEXT: v_mov_b32_e32 v15, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s17 -; GFX10-NEXT: v_mov_b32_e32 v17, s18 -; GFX10-NEXT: v_mov_b32_e32 v18, s19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 +; GFX10-NEXT: s_mov_b32 s12, 0 +; GFX10-NEXT: s_mov_b64 s[10:11], 4.0 +; GFX10-NEXT: s_mov_b32 s13, 0x40140000 +; GFX10-NEXT: v_mov_b32_e32 v9, s10 +; GFX10-NEXT: v_mov_b32_e32 v10, s11 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, s12 +; GFX10-NEXT: v_mov_b32_e32 v12, s13 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX10-NEXT: s_mov_b32 s14, 0 +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_mov_b32 s16, 0 +; GFX10-NEXT: s_mov_b32 s15, 0x40180000 +; GFX10-NEXT: s_mov_b32 s19, 0x40200000 +; GFX10-NEXT: s_mov_b32 s17, 0x401c0000 +; GFX10-NEXT: v_mov_b32_e32 v13, s14 +; GFX10-NEXT: v_mov_b32_e32 v14, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v2 +; GFX10-NEXT: v_mov_b32_e32 v15, s16 +; GFX10-NEXT: v_mov_b32_e32 v16, s17 +; GFX10-NEXT: v_mov_b32_e32 v17, s18 +; GFX10-NEXT: v_mov_b32_e32 v18, s19 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -818,47 +818,47 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX11-LABEL: dyn_insertelement_v8f64_const_s_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s14, 0 -; GFX11-NEXT: s_mov_b32 s15, 0x40200000 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_mov_b32 s10, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 -; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 -; GFX11-NEXT: s_mov_b32 s11, 0x40180000 -; GFX11-NEXT: s_mov_b32 s9, 0x40140000 -; GFX11-NEXT: s_mov_b64 s[6:7], 4.0 -; GFX11-NEXT: s_mov_b32 s5, 
0x40080000 -; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 +; GFX11-NEXT: s_mov_b32 s5, 0x40080000 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], 4.0 +; GFX11-NEXT: s_mov_b32 s9, 0x40140000 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_mov_b32 s14, 0 +; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_mov_b32 s11, 0x40180000 +; GFX11-NEXT: s_mov_b32 s15, 0x40200000 +; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 +; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 
5, v2 +; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -884,24 +884,22 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 
@@ -926,6 +924,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13] @@ -954,62 +954,62 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_s_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, s14 -; GFX10-NEXT: v_mov_b32_e32 v14, s13 -; GFX10-NEXT: v_mov_b32_e32 v13, s12 -; GFX10-NEXT: v_mov_b32_e32 v12, s11 -; GFX10-NEXT: v_mov_b32_e32 v11, s10 -; GFX10-NEXT: v_mov_b32_e32 v10, s9 -; GFX10-NEXT: v_mov_b32_e32 v9, s8 -; GFX10-NEXT: v_mov_b32_e32 v8, s7 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v6, s5 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, 
s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v8, s7 +; GFX10-NEXT: v_mov_b32_e32 v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: v_mov_b32_e32 v12, s11 +; GFX10-NEXT: v_mov_b32_e32 v11, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 +; GFX10-NEXT: v_mov_b32_e32 v16, s15 +; GFX10-NEXT: v_mov_b32_e32 v15, s14 +; GFX10-NEXT: v_mov_b32_e32 v14, s13 +; GFX10-NEXT: v_mov_b32_e32 v13, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; 
GFX10-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[5:8], off @@ -1022,54 +1022,54 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_s_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 -; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 -; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: 
s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 ; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc @@ -1095,22 +1095,22 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s: ; GPRIDX: ; %bb.0: ; 
%entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 @@ -1144,25 +1144,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v17, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_mov_b32 s15, s17 ; 
GFX10-NEXT: s_lshl_b32 m0, s18, 1 +; GFX10-NEXT: v_mov_b32_e32 v17, s15 ; GFX10-NEXT: v_mov_b32_e32 v16, s14 ; GFX10-NEXT: v_mov_b32_e32 v15, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s12 @@ -1191,25 +1191,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_lshl_b32 m0, s18, 1 +; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 ; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 @@ -1303,26 +1303,32 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, 
s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 @@ -1331,12 +1337,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 @@ -1371,62 +1371,62 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_v_v: ; GFX10: ; %bb.0: ; %entry 
-; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v18, s15 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_mov_b32_e32 v17, s14 -; GFX10-NEXT: v_mov_b32_e32 v16, s13 -; GFX10-NEXT: v_mov_b32_e32 v15, s12 -; GFX10-NEXT: v_mov_b32_e32 v14, s11 -; GFX10-NEXT: v_mov_b32_e32 v13, s10 -; GFX10-NEXT: v_mov_b32_e32 v12, s9 -; GFX10-NEXT: v_mov_b32_e32 v11, s8 -; GFX10-NEXT: v_mov_b32_e32 v10, s7 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v10, s7 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; 
GFX10-NEXT: v_mov_b32_e32 v12, s9 +; GFX10-NEXT: v_mov_b32_e32 v11, s8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: v_mov_b32_e32 v14, s11 +; GFX10-NEXT: v_mov_b32_e32 v13, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX10-NEXT: v_mov_b32_e32 v18, s15 +; GFX10-NEXT: v_mov_b32_e32 v17, s14 +; GFX10-NEXT: v_mov_b32_e32 v16, s13 +; GFX10-NEXT: v_mov_b32_e32 v15, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -1439,50 +1439,50 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_v_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: 
s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 
s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -2564,6 +2564,8 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX10-LABEL: dyn_insertelement_v9f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -2572,8 +2574,6 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s11 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2589,6 +2589,7 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX11-LABEL: dyn_insertelement_v9f32_s_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: 
v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 @@ -2597,7 +2598,6 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 m0, s11 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 @@ -2794,6 +2794,8 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX10-LABEL: dyn_insertelement_v10f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -2803,8 +2805,6 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s12 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2822,21 +2822,21 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: v_mov_b32_e32 v10, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 
s3 ; GFX11-NEXT: s_mov_b32 m0, s12 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_mov_b32_e32 v8, s8 ; GFX11-NEXT: v_movreld_b32_e32 v0, v10 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -3041,6 +3041,8 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX10-LABEL: dyn_insertelement_v11f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -3051,8 +3053,6 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: v_mov_b32_e32 v11, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s13 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -3070,6 +3070,7 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX11-LABEL: dyn_insertelement_v11f32_s_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 @@ -3080,7 +3081,6 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 
m0, s13 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 @@ -3304,6 +3304,8 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX10-LABEL: dyn_insertelement_v12f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -3315,8 +3317,6 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: v_mov_b32_e32 v12, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s14 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -3336,8 +3336,10 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v12, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -3346,14 +3348,12 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: v_mov_b32_e32 v12, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s14 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 
v11, s11 :: v_dual_mov_b32 v10, s10 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_mov_b32_e32 v10, s10 ; GFX11-NEXT: v_movreld_b32_e32 v0, v12 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4075,22 +4075,22 @@ entry: define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %vec, i32 %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16i32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 @@ -4130,25 +4130,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve ; ; GFX10-LABEL: dyn_insertelement_v16i32_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; 
GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s15 -; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: v_mov_b32_e32 v16, s15 ; GFX10-NEXT: v_mov_b32_e32 v15, s14 ; GFX10-NEXT: v_mov_b32_e32 v14, s13 ; GFX10-NEXT: v_mov_b32_e32 v13, s12 @@ -4184,25 +4184,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve ; ; GFX11-LABEL: dyn_insertelement_v16i32_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 m0, s18 +; 
GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 ; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 ; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 ; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 @@ -4276,6 +4276,8 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX10-LABEL: dyn_insertelement_v16f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v16, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -4291,8 +4293,6 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 ; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: v_mov_b32_e32 v16, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s18 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -4316,8 +4316,10 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -4330,16 +4332,14 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 -; GFX11-NEXT: v_mov_b32_e32 v16, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s18 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: 
v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 -; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12 -; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_mov_b32_e32 v14, s14 ; GFX11-NEXT: v_movreld_b32_e32 v0, v16 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4423,6 +4423,8 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX10-LABEL: dyn_insertelement_v32f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v32, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -4454,8 +4456,6 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX10-NEXT: s_mov_b32 s29, s31 ; GFX10-NEXT: s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v32, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s34 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -4495,8 +4495,10 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v32, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: 
s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -4525,24 +4527,22 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s34 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 -; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12 -; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14 -; GFX11-NEXT: v_dual_mov_b32 v17, s17 :: v_dual_mov_b32 v16, s16 -; GFX11-NEXT: v_dual_mov_b32 v19, s19 :: v_dual_mov_b32 v18, s18 -; GFX11-NEXT: v_dual_mov_b32 v21, s21 :: v_dual_mov_b32 v20, s20 -; GFX11-NEXT: v_dual_mov_b32 v23, s23 :: v_dual_mov_b32 v22, s22 -; GFX11-NEXT: v_dual_mov_b32 v25, s25 :: v_dual_mov_b32 v24, s24 -; GFX11-NEXT: v_dual_mov_b32 v27, s27 :: v_dual_mov_b32 v26, s26 -; GFX11-NEXT: v_dual_mov_b32 v29, s29 :: v_dual_mov_b32 v28, s28 -; GFX11-NEXT: v_dual_mov_b32 v31, s31 :: v_dual_mov_b32 v30, s30 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 
v20, s20 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v29, s29 +; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v31, s31 +; GFX11-NEXT: v_mov_b32_e32 v30, s30 ; GFX11-NEXT: v_movreld_b32_e32 v0, v32 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4553,40 +4553,40 @@ entry: define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %vec, i64 %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16i64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 
s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4658,41 +4658,41 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve ; ; GFX10-LABEL: dyn_insertelement_v16i64_s_v_s: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 ; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_mov_b32 s16, s18 ; GFX10-NEXT: s_mov_b32 s17, s19 +; GFX10-NEXT: s_mov_b32 s18, s20 ; GFX10-NEXT: s_mov_b32 s19, s21 +; GFX10-NEXT: s_mov_b32 s20, s22 ; GFX10-NEXT: s_mov_b32 s21, s23 +; GFX10-NEXT: s_mov_b32 s22, s24 ; GFX10-NEXT: s_mov_b32 s23, s25 +; GFX10-NEXT: s_mov_b32 s24, s26 ; GFX10-NEXT: s_mov_b32 s25, s27 +; GFX10-NEXT: s_mov_b32 s26, s28 ; GFX10-NEXT: s_mov_b32 s27, s29 +; GFX10-NEXT: s_mov_b32 s28, s30 ; GFX10-NEXT: s_mov_b32 s29, s31 ; GFX10-NEXT: s_mov_b32 s31, s33 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: 
s_mov_b32 s16, s18 -; GFX10-NEXT: s_mov_b32 s18, s20 -; GFX10-NEXT: s_mov_b32 s20, s22 -; GFX10-NEXT: s_mov_b32 s22, s24 -; GFX10-NEXT: s_mov_b32 s24, s26 -; GFX10-NEXT: s_mov_b32 s26, s28 -; GFX10-NEXT: s_mov_b32 s28, s30 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v33, s31 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s34, 1 +; GFX10-NEXT: v_mov_b32_e32 v33, s31 ; GFX10-NEXT: v_mov_b32_e32 v32, s30 ; GFX10-NEXT: v_mov_b32_e32 v31, s29 ; GFX10-NEXT: v_mov_b32_e32 v30, s28 @@ -4761,41 +4761,41 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve ; ; GFX11-LABEL: dyn_insertelement_v16i64_s_v_s: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_mov_b32 s16, s18 ; GFX11-NEXT: s_mov_b32 s17, s19 +; GFX11-NEXT: s_mov_b32 s18, s20 ; GFX11-NEXT: s_mov_b32 s19, s21 +; GFX11-NEXT: s_mov_b32 s20, s22 ; GFX11-NEXT: s_mov_b32 s21, s23 +; GFX11-NEXT: s_mov_b32 s22, s24 ; GFX11-NEXT: s_mov_b32 s23, s25 +; GFX11-NEXT: s_mov_b32 s24, s26 ; GFX11-NEXT: s_mov_b32 s25, s27 +; GFX11-NEXT: s_mov_b32 s26, s28 ; GFX11-NEXT: s_mov_b32 s27, s29 +; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 -; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 
s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: s_mov_b32 s16, s18 -; GFX11-NEXT: s_mov_b32 s18, s20 -; GFX11-NEXT: s_mov_b32 s20, s22 -; GFX11-NEXT: s_mov_b32 s22, s24 -; GFX11-NEXT: s_mov_b32 s24, s26 -; GFX11-NEXT: s_mov_b32 s26, s28 -; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s34, 1 +; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 ; GFX11-NEXT: v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28 ; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26 ; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24 @@ -4853,40 +4853,40 @@ entry: define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: 
s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4958,41 +4958,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; ; GFX10-LABEL: dyn_insertelement_v16f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_mov_b32 s17, s19 -; GFX10-NEXT: s_mov_b32 s19, s21 -; GFX10-NEXT: s_mov_b32 s21, s23 -; GFX10-NEXT: s_mov_b32 s23, s25 -; GFX10-NEXT: s_mov_b32 s25, s27 -; GFX10-NEXT: s_mov_b32 s27, s29 -; GFX10-NEXT: s_mov_b32 s29, s31 -; GFX10-NEXT: s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, 
s15 ; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s16, s18 +; GFX10-NEXT: s_mov_b32 s17, s19 ; GFX10-NEXT: s_mov_b32 s18, s20 +; GFX10-NEXT: s_mov_b32 s19, s21 ; GFX10-NEXT: s_mov_b32 s20, s22 +; GFX10-NEXT: s_mov_b32 s21, s23 ; GFX10-NEXT: s_mov_b32 s22, s24 +; GFX10-NEXT: s_mov_b32 s23, s25 ; GFX10-NEXT: s_mov_b32 s24, s26 +; GFX10-NEXT: s_mov_b32 s25, s27 ; GFX10-NEXT: s_mov_b32 s26, s28 +; GFX10-NEXT: s_mov_b32 s27, s29 ; GFX10-NEXT: s_mov_b32 s28, s30 +; GFX10-NEXT: s_mov_b32 s29, s31 +; GFX10-NEXT: s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v33, s31 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s34, 1 +; GFX10-NEXT: v_mov_b32_e32 v33, s31 ; GFX10-NEXT: v_mov_b32_e32 v32, s30 ; GFX10-NEXT: v_mov_b32_e32 v31, s29 ; GFX10-NEXT: v_mov_b32_e32 v30, s28 @@ -5061,41 +5061,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; ; GFX11-LABEL: dyn_insertelement_v16f64_s_v_s: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_mov_b32 s16, s18 ; GFX11-NEXT: s_mov_b32 s17, s19 +; GFX11-NEXT: s_mov_b32 s18, s20 ; GFX11-NEXT: s_mov_b32 s19, s21 +; GFX11-NEXT: s_mov_b32 s20, s22 ; GFX11-NEXT: s_mov_b32 s21, s23 +; GFX11-NEXT: s_mov_b32 s22, s24 ; GFX11-NEXT: s_mov_b32 s23, s25 +; GFX11-NEXT: s_mov_b32 s24, s26 ; GFX11-NEXT: s_mov_b32 s25, s27 +; GFX11-NEXT: s_mov_b32 s26, s28 ; 
GFX11-NEXT: s_mov_b32 s27, s29 +; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 -; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: s_mov_b32 s16, s18 -; GFX11-NEXT: s_mov_b32 s18, s20 -; GFX11-NEXT: s_mov_b32 s20, s22 -; GFX11-NEXT: s_mov_b32 s22, s24 -; GFX11-NEXT: s_mov_b32 s24, s26 -; GFX11-NEXT: s_mov_b32 s26, s28 -; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s34, 1 +; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 ; GFX11-NEXT: v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28 ; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26 ; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24 @@ -5498,8 +5498,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 @@ -5539,6 +5537,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -5551,10 +5550,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 
s13, s15 -; GFX10-NEXT: v_mov_b32_e32 v17, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s16, 1 -; GFX10-NEXT: v_mov_b32_e32 v16, s14 ; GFX10-NEXT: v_mov_b32_e32 v15, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s12 ; GFX10-NEXT: v_mov_b32_e32 v13, s11 @@ -5590,6 +5586,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -5602,8 +5599,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s16, 1 ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 @@ -5650,8 +5645,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 @@ -5709,57 +5702,55 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; 
GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 +; GFX10-NEXT: v_mov_b32_e32 v10, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: v_mov_b32_e32 v18, s15 -; GFX10-NEXT: v_mov_b32_e32 v17, s14 -; GFX10-NEXT: v_mov_b32_e32 v16, s13 ; GFX10-NEXT: v_mov_b32_e32 v15, s12 +; GFX10-NEXT: v_mov_b32_e32 v16, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s11 ; GFX10-NEXT: v_mov_b32_e32 v13, s10 ; GFX10-NEXT: v_mov_b32_e32 v12, s9 ; GFX10-NEXT: v_mov_b32_e32 v11, s8 -; GFX10-NEXT: v_mov_b32_e32 v10, s7 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: 
v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s3, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v3 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 +; GFX10-NEXT: v_readfirstlane_b32 s3, v6 ; GFX10-NEXT: v_readfirstlane_b32 s4, v7 ; GFX10-NEXT: v_readfirstlane_b32 s5, v8 ; GFX10-NEXT: v_readfirstlane_b32 s6, v9 @@ -5778,46 +5769,45 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_dual_mov_b32 v9, s6 :: v_dual_mov_b32 v10, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 +; 
GFX11-NEXT: v_dual_mov_b32 v15, s12 :: v_dual_mov_b32 v16, s13 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 -; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v2, v12, v1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 ; GFX11-NEXT: v_readfirstlane_b32 s4, v7 ; GFX11-NEXT: v_readfirstlane_b32 s5, v8 ; GFX11-NEXT: v_readfirstlane_b32 s6, v9 @@ -6040,35 +6030,35 @@ entry: define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s: ; GPRIDX: ; %bb.0: ; 
%entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc @@ -6089,35 +6079,35 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 
s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v11, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: v_mov_b32_e32 v10, s8 +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_mov_b32_e32 v8, s6 ; GFX10-NEXT: v_mov_b32_e32 v7, s5 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX10-NEXT: v_readfirstlane_b32 s2, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo @@ -6138,29 +6128,29 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; 
GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_cndmask_b32 v3, v3, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v0 :: v_dual_cndmask_b32 v7, v7, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0 @@ -6185,30 +6175,30 @@ entry: define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v4, 
s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 @@ -6234,35 +6224,35 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v12, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: v_mov_b32_e32 v11, s8 +; GFX10-NEXT: v_mov_b32_e32 v12, s9 ; GFX10-NEXT: v_mov_b32_e32 v10, s7 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v7, s4 -; 
GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo @@ -6283,29 +6273,29 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 -; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v2, v8, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 588802cbd56c7..272dfaf59848a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -34,8 +34,8 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 7fd981c3f3fc6..fa0030c566743 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; 
CHECK-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 6846137272ec6..bb44bd0be28af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -194,11 +194,11 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s6, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_nop 3 @@ -212,11 +212,11 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 @@ -230,9 +230,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] 
@@ -245,9 +245,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX10_W64-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -273,9 +273,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX11_W64-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11_W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11_W64-NEXT: s_and_b32 s6, 1, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX11_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -729,11 +729,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-NEXT: s_and_b32 s0, 1, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -749,11 +749,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: 
s_and_b32 s0, 1, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 @@ -771,9 +771,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s12 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s13 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s15 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 @@ -788,9 +788,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s12 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s13 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s15 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -820,9 +820,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: s_and_b32 s8, 1, s8 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v3, s7 ; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] ; GFX11_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1368,8 +1368,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 0535394d1025c..c3e9df721f368 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -379,10 +379,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -453,10 +453,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 
-; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -527,10 +527,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -601,10 +601,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1213,10 +1213,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1282,10 +1282,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll index 2d0d04e1b533e..898c5794414fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -112,19 +112,20 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -137,17 +138,19 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: 
s_mov_b32 s3, s5 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v3, v10 :: v_dual_mov_b32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[10:11] @@ -221,19 +224,20 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -246,17 +250,19 @@ define amdgpu_ps <4 x float> 
@load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v3, v10 :: v_dual_mov_b32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index 676bd8856ce6e..bf3262deca0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -66,15 +66,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 
v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -83,6 +81,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 @@ -190,15 +190,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -207,6 +205,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll index a101a15ea8140..138efac05f041 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -112,24 +112,24 @@ define amdgpu_ps <4 x float> 
@load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 -; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -138,22 +138,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX12-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v9, 0 -; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; 
GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 +; GFX12-NEXT: v_dual_mov_b32 v3, v12 :: v_dual_mov_b32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] @@ -227,24 +228,24 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 -; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, 
v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -253,22 +254,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX12-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v9, 0 -; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 +; GFX12-NEXT: v_dual_mov_b32 v3, v12 :: v_dual_mov_b32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index b20dc4b539276..f7dc1e4e9d323 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ 
-59,15 +59,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -76,6 +73,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-NEXT: v_mov_b32_e32 v3, v10 @@ -117,23 +117,23 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_mov_b32_e32 v12, v2 -; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: 
v_dual_mov_b32 v2, v8 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v12, v2 +; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -155,8 +155,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -172,15 +172,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -189,6 +186,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; 
GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-NEXT: v_mov_b32_e32 v3, v10 @@ -230,23 +230,23 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_mov_b32_e32 v12, v2 -; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v12, v2 +; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -268,8 +268,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: 
v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll index 7f32d8e6e16b4..add2149a70226 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -114,8 +114,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -124,9 +124,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] @@ -225,8 +225,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: 
v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -235,9 +235,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll index c1c383eb583aa..48a854a71a088 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -698,7 +698,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 % define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, float %in) #0 { ; GFX6-LABEL: image_store_f32_dmask_1111: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v1, s10 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -707,13 +706,13 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: image_store_f32_dmask_1111: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -722,6 +721,7 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index c23afeb63a06a..e0fe31e0d1090 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -669,22 +669,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -692,9 +694,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -781,15 +783,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 1.0 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -797,9 +801,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -891,28 +895,28 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: 
v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v5, s10 +; GFX11-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -995,24 +999,24 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 
; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index cc21305a5a193..e28155d63a0b4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -24,7 +24,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 @@ -40,6 +39,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: v_accvgpr_write_b32 a13, s13 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15 +; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a17, s17 ; GCN-NEXT: v_accvgpr_write_b32 a18, s18 ; GCN-NEXT: v_accvgpr_write_b32 a19, s19 @@ -319,12 +319,12 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: 
v_accvgpr_write_b32 a2, s8 -; GCN-NEXT: v_accvgpr_write_b32 a4, s8 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s8 ; GCN-NEXT: v_accvgpr_write_b32 a3, s9 +; GCN-NEXT: v_accvgpr_write_b32 a4, s8 ; GCN-NEXT: v_accvgpr_write_b32 a5, s9 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_nop 1 @@ -351,12 +351,12 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GCN-NEXT: v_accvgpr_write_b32 a0, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_accvgpr_write_b32 a2, s6 -; GCN-NEXT: v_accvgpr_write_b32 a4, s6 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a1, s7 +; GCN-NEXT: v_accvgpr_write_b32 a2, s6 ; GCN-NEXT: v_accvgpr_write_b32 a3, s7 +; GCN-NEXT: v_accvgpr_write_b32 a4, s6 ; GCN-NEXT: v_accvgpr_write_b32 a5, s7 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1] ; GCN-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 90e2840f0d667..570a33fa6f753 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ 
-52,8 +52,9 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index e5d9884e5ee29..28d1bd0eb3b63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -10,10 +10,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index b1de0eff05d30..3bcc1a6a3affd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -291,10 +291,10 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_readfirstlane_b32 s9, v5 ; GFX9-NEXT: v_readfirstlane_b32 s10, v6 ; GFX9-NEXT: v_readfirstlane_b32 s11, v7 
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -322,10 +322,10 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s8, v4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v5 ; GFX7-NEXT: v_readfirstlane_b32 s10, v6 ; GFX7-NEXT: v_readfirstlane_b32 s11, v7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -383,9 +383,9 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v4, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v6, s10 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 27005e7aa175e..7704463ca8006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -406,12 +406,12 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v3 -; GFX7-NEXT: 
v_readfirstlane_b32 s5, v5 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s4, v4 -; GFX7-NEXT: s_add_i32 s1, s1, s5 +; GFX7-NEXT: v_readfirstlane_b32 s5, v5 ; GFX7-NEXT: s_add_i32 s0, s0, s4 +; GFX7-NEXT: s_add_i32 s1, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -425,14 +425,14 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] ; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s1, v3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: s_add_i32 s1, s1, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_add_i32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -443,14 +443,14 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1] ; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s1, v3 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: s_add_co_i32 s1, s1, s3 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-NEXT: s_add_co_i32 s1, s1, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 
v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2 @@ -471,9 +471,9 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 -; GFX7-NEXT: v_readfirstlane_b32 s4, v5 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: v_readfirstlane_b32 s4, v5 ; GFX7-NEXT: v_readfirstlane_b32 s5, v6 ; GFX7-NEXT: v_readfirstlane_b32 s7, v7 ; GFX7-NEXT: s_add_i32 s4, s0, s4 @@ -493,18 +493,17 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1] ; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 -; GFX11-NEXT: s_add_i32 s2, s2, s5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: s_add_i32 s0, s0, s3 ; GFX11-NEXT: s_add_i32 s1, s1, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_add_i32 s2, s2, s5 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: s_endpgm ; @@ -515,18 +514,17 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1] ; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: 
v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 -; GFX12-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12-NEXT: s_add_co_i32 s1, s1, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s2 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: s_endpgm %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2 @@ -547,10 +545,10 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 -; GFX7-NEXT: v_readfirstlane_b32 s4, v6 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 +; GFX7-NEXT: v_readfirstlane_b32 s4, v6 ; GFX7-NEXT: v_readfirstlane_b32 s5, v7 ; GFX7-NEXT: v_readfirstlane_b32 s8, v8 ; GFX7-NEXT: v_readfirstlane_b32 s9, v9 @@ -573,21 +571,20 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: 
v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: s_add_i32 s0, s0, s4 ; GFX11-NEXT: s_add_i32 s1, s1, s5 ; GFX11-NEXT: s_add_i32 s2, s2, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: s_endpgm ; @@ -598,21 +595,20 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 -; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: s_add_co_i32 s0, s0, s4 ; GFX12-NEXT: s_add_co_i32 s1, s1, s5 ; GFX12-NEXT: s_add_co_i32 s2, s2, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2 @@ -638,29 +634,29 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_readfirstlane_b32 s4, 
v2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 -; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 -; GFX7-NEXT: v_readfirstlane_b32 s8, v6 +; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 -; GFX7-NEXT: v_readfirstlane_b32 s16, v14 -; GFX7-NEXT: s_add_i32 s4, s4, s12 +; GFX7-NEXT: v_readfirstlane_b32 s8, v6 ; GFX7-NEXT: v_readfirstlane_b32 s9, v7 ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 +; GFX7-NEXT: v_readfirstlane_b32 s16, v14 ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 +; GFX7-NEXT: s_add_i32 s4, s4, s12 ; GFX7-NEXT: s_add_i32 s5, s5, s13 ; GFX7-NEXT: s_add_i32 s6, s6, s14 ; GFX7-NEXT: s_add_i32 s7, s7, s15 ; GFX7-NEXT: s_add_i32 s8, s8, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s9, s17 ; GFX7-NEXT: s_add_i32 s10, s10, s18 ; GFX7-NEXT: s_add_i32 s11, s11, s19 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 @@ -682,30 +678,30 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 -; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; 
GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 -; GFX11-NEXT: s_add_i32 s3, s3, s11 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 ; GFX11-NEXT: s_add_i32 s0, s0, s8 ; GFX11-NEXT: s_add_i32 s1, s1, s9 ; GFX11-NEXT: s_add_i32 s2, s2, s10 -; GFX11-NEXT: s_add_i32 s7, s7, s15 +; GFX11-NEXT: s_add_i32 s3, s3, s11 ; GFX11-NEXT: s_add_i32 s4, s4, s12 ; GFX11-NEXT: s_add_i32 s5, s5, s13 ; GFX11-NEXT: s_add_i32 s6, s6, s14 +; GFX11-NEXT: s_add_i32 s7, s7, s15 ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -725,30 +721,30 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 -; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 -; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 -; GFX12-NEXT: s_add_co_i32 s3, s3, s11 +; GFX12-NEXT: 
v_readfirstlane_b32 s15, v17 ; GFX12-NEXT: s_add_co_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_i32 s1, s1, s9 ; GFX12-NEXT: s_add_co_i32 s2, s2, s10 -; GFX12-NEXT: s_add_co_i32 s7, s7, s15 +; GFX12-NEXT: s_add_co_i32 s3, s3, s11 ; GFX12-NEXT: s_add_co_i32 s4, s4, s12 ; GFX12-NEXT: s_add_co_i32 s5, s5, s13 ; GFX12-NEXT: s_add_co_i32 s6, s6, s14 +; GFX12-NEXT: s_add_co_i32 s7, s7, s15 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -822,30 +818,30 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 ; GFX7-NEXT: s_add_i32 s7, s7, s23 ; GFX7-NEXT: s_add_i32 s8, s8, s24 -; GFX7-NEXT: s_add_i32 s12, s12, s28 -; GFX7-NEXT: s_add_i32 s16, s16, s33 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s9, s25 ; GFX7-NEXT: s_add_i32 s10, s10, s26 ; GFX7-NEXT: s_add_i32 s11, s11, s27 +; GFX7-NEXT: s_add_i32 s12, s12, s28 ; GFX7-NEXT: s_add_i32 s13, s13, s29 ; GFX7-NEXT: s_add_i32 s14, s14, s30 ; GFX7-NEXT: s_add_i32 s15, s15, s31 +; GFX7-NEXT: s_add_i32 s16, s16, s33 ; GFX7-NEXT: s_add_i32 s17, s17, s34 ; GFX7-NEXT: s_add_i32 s18, s18, s35 ; GFX7-NEXT: s_add_i32 s19, s19, s36 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-NEXT: v_mov_b32_e32 v10, s12 -; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 @@ -871,57 +867,57 @@ define amdgpu_ps void 
@load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s19, v21 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s16, v18 ; GFX11-NEXT: v_readfirstlane_b32 s17, v19 ; GFX11-NEXT: v_readfirstlane_b32 s18, v20 -; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s19, v21 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-NEXT: v_readfirstlane_b32 s11, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s20, v22 ; GFX11-NEXT: v_readfirstlane_b32 s21, v23 ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 -; GFX11-NEXT: v_readfirstlane_b32 s27, v29 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 -; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s24, v26 ; GFX11-NEXT: v_readfirstlane_b32 s25, v27 ; GFX11-NEXT: v_readfirstlane_b32 s26, v28 -; GFX11-NEXT: v_readfirstlane_b32 s31, v33 +; GFX11-NEXT: v_readfirstlane_b32 s27, v29 ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v30 ; GFX11-NEXT: v_readfirstlane_b32 s29, v31 ; GFX11-NEXT: v_readfirstlane_b32 s30, v32 -; GFX11-NEXT: s_add_i32 s3, s3, s19 +; GFX11-NEXT: v_readfirstlane_b32 s31, v33 ; GFX11-NEXT: s_add_i32 s0, s0, s16 ; GFX11-NEXT: 
s_add_i32 s1, s1, s17 ; GFX11-NEXT: s_add_i32 s2, s2, s18 -; GFX11-NEXT: s_add_i32 s7, s7, s23 +; GFX11-NEXT: s_add_i32 s3, s3, s19 ; GFX11-NEXT: s_add_i32 s4, s4, s20 ; GFX11-NEXT: s_add_i32 s5, s5, s21 ; GFX11-NEXT: s_add_i32 s6, s6, s22 -; GFX11-NEXT: s_add_i32 s11, s11, s27 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s7, s7, s23 ; GFX11-NEXT: s_add_i32 s8, s8, s24 ; GFX11-NEXT: s_add_i32 s9, s9, s25 ; GFX11-NEXT: s_add_i32 s10, s10, s26 -; GFX11-NEXT: s_add_i32 s15, s15, s31 -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s11, s11, s27 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s12, s28 ; GFX11-NEXT: s_add_i32 s13, s13, s29 ; GFX11-NEXT: s_add_i32 s14, s14, s30 +; GFX11-NEXT: s_add_i32 s15, s15, s31 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -951,57 +947,57 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: v_readfirstlane_b32 s19, v21 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s16, v18 ; GFX12-NEXT: v_readfirstlane_b32 s17, v19 ; GFX12-NEXT: v_readfirstlane_b32 s18, v20 -; GFX12-NEXT: v_readfirstlane_b32 s23, v25 +; GFX12-NEXT: v_readfirstlane_b32 s19, v21 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 
s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 -; GFX12-NEXT: v_readfirstlane_b32 s11, v13 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s20, v22 ; GFX12-NEXT: v_readfirstlane_b32 s21, v23 ; GFX12-NEXT: v_readfirstlane_b32 s22, v24 -; GFX12-NEXT: v_readfirstlane_b32 s27, v29 +; GFX12-NEXT: v_readfirstlane_b32 s23, v25 ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 -; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s24, v26 ; GFX12-NEXT: v_readfirstlane_b32 s25, v27 ; GFX12-NEXT: v_readfirstlane_b32 s26, v28 -; GFX12-NEXT: v_readfirstlane_b32 s31, v33 +; GFX12-NEXT: v_readfirstlane_b32 s27, v29 ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 +; GFX12-NEXT: v_readfirstlane_b32 s15, v17 ; GFX12-NEXT: v_readfirstlane_b32 s28, v30 ; GFX12-NEXT: v_readfirstlane_b32 s29, v31 ; GFX12-NEXT: v_readfirstlane_b32 s30, v32 -; GFX12-NEXT: s_add_co_i32 s3, s3, s19 +; GFX12-NEXT: v_readfirstlane_b32 s31, v33 ; GFX12-NEXT: s_add_co_i32 s0, s0, s16 ; GFX12-NEXT: s_add_co_i32 s1, s1, s17 ; GFX12-NEXT: s_add_co_i32 s2, s2, s18 -; GFX12-NEXT: s_add_co_i32 s7, s7, s23 +; GFX12-NEXT: s_add_co_i32 s3, s3, s19 ; GFX12-NEXT: s_add_co_i32 s4, s4, s20 ; GFX12-NEXT: s_add_co_i32 s5, s5, s21 ; GFX12-NEXT: s_add_co_i32 s6, s6, s22 -; GFX12-NEXT: s_add_co_i32 s11, s11, s27 -; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s7, s7, s23 ; GFX12-NEXT: s_add_co_i32 s8, s8, s24 ; GFX12-NEXT: s_add_co_i32 s9, s9, s25 ; GFX12-NEXT: s_add_co_i32 s10, s10, s26 -; GFX12-NEXT: s_add_co_i32 s15, s15, s31 -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s11, s11, s27 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; 
GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s12, s28 ; GFX12-NEXT: s_add_co_i32 s13, s13, s29 ; GFX12-NEXT: s_add_co_i32 s14, s14, s30 +; GFX12-NEXT: s_add_co_i32 s15, s15, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -1615,11 +1611,11 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s1, s5, s1 ; GFX7-NEXT: s_add_i32 s0, s4, s0 +; GFX7-NEXT: s_add_i32 s1, s5, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -1638,7 +1634,7 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: s_add_i32 s0, s2, s0 ; GFX11-NEXT: s_add_i32 s1, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -1654,7 +1650,7 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-NEXT: s_add_co_i32 s1, s3, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2 @@ -1693,16 +1689,15 @@ define amdgpu_ps void 
@load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s5, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s3, v2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v3 -; GFX11-NEXT: s_add_i32 s2, s5, s2 +; GFX11-NEXT: v_readfirstlane_b32 s5, v4 ; GFX11-NEXT: s_add_i32 s0, s3, s0 ; GFX11-NEXT: s_add_i32 s1, s4, s1 +; GFX11-NEXT: s_add_i32 s2, s5, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: s_endpgm @@ -1713,15 +1708,15 @@ define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v3 +; GFX12-NEXT: v_readfirstlane_b32 s5, v4 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s2, s5, s2 ; GFX12-NEXT: s_add_co_i32 s0, s3, s0 ; GFX12-NEXT: s_add_co_i32 s1, s4, s1 +; GFX12-NEXT: s_add_co_i32 s2, s5, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: s_endpgm @@ -1765,18 +1760,17 @@ define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_readfirstlane_b32 s7, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v2 ; GFX11-NEXT: v_readfirstlane_b32 s5, v3 ; GFX11-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-NEXT: v_readfirstlane_b32 s7, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s0, s4, s0 ; GFX11-NEXT: s_add_i32 s1, s5, s1 ; GFX11-NEXT: s_add_i32 s2, s6, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_add_i32 s3, s7, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: s_endpgm ; @@ -1786,18 +1780,17 @@ define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s7, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v2 ; GFX12-NEXT: v_readfirstlane_b32 s5, v3 ; GFX12-NEXT: v_readfirstlane_b32 s6, v4 +; GFX12-NEXT: v_readfirstlane_b32 s7, v5 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s7, s3 ; GFX12-NEXT: s_add_co_i32 s0, s4, s0 ; GFX12-NEXT: s_add_co_i32 s1, s5, s1 ; GFX12-NEXT: s_add_co_i32 s2, s6, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s3, s7, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2 @@ -1825,19 +1818,19 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX7-NEXT: 
v_readfirstlane_b32 s15, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s16, v6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s4, s12, s4 ; GFX7-NEXT: v_readfirstlane_b32 s17, v7 ; GFX7-NEXT: v_readfirstlane_b32 s18, v8 ; GFX7-NEXT: v_readfirstlane_b32 s19, v9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s12, s4 ; GFX7-NEXT: s_add_i32 s5, s13, s5 ; GFX7-NEXT: s_add_i32 s6, s14, s6 ; GFX7-NEXT: s_add_i32 s7, s15, s7 ; GFX7-NEXT: s_add_i32 s8, s16, s8 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s17, s9 ; GFX7-NEXT: s_add_i32 s10, s18, s10 ; GFX7-NEXT: s_add_i32 s11, s19, s11 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 @@ -1857,24 +1850,24 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_readfirstlane_b32 s11, v5 ; GFX11-NEXT: v_readfirstlane_b32 s8, v2 ; GFX11-NEXT: v_readfirstlane_b32 s9, v3 ; GFX11-NEXT: v_readfirstlane_b32 s10, v4 +; GFX11-NEXT: v_readfirstlane_b32 s11, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s15, v9 ; GFX11-NEXT: v_readfirstlane_b32 s12, v6 ; GFX11-NEXT: v_readfirstlane_b32 s13, v7 ; GFX11-NEXT: v_readfirstlane_b32 s14, v8 +; GFX11-NEXT: v_readfirstlane_b32 s15, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s11, s3 ; GFX11-NEXT: s_add_i32 s0, s8, s0 ; GFX11-NEXT: s_add_i32 s1, s9, s1 ; GFX11-NEXT: s_add_i32 s2, s10, s2 -; GFX11-NEXT: s_add_i32 s7, s15, s7 +; GFX11-NEXT: s_add_i32 s3, s11, s3 ; GFX11-NEXT: s_add_i32 s4, s12, s4 ; GFX11-NEXT: s_add_i32 s5, s13, s5 ; GFX11-NEXT: s_add_i32 s6, s14, s6 +; GFX11-NEXT: s_add_i32 s7, s15, s7 ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 
v2, s0 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -1892,24 +1885,24 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s11, v5 ; GFX12-NEXT: v_readfirstlane_b32 s8, v2 ; GFX12-NEXT: v_readfirstlane_b32 s9, v3 ; GFX12-NEXT: v_readfirstlane_b32 s10, v4 +; GFX12-NEXT: v_readfirstlane_b32 s11, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s15, v9 ; GFX12-NEXT: v_readfirstlane_b32 s12, v6 ; GFX12-NEXT: v_readfirstlane_b32 s13, v7 ; GFX12-NEXT: v_readfirstlane_b32 s14, v8 +; GFX12-NEXT: v_readfirstlane_b32 s15, v9 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s11, s3 ; GFX12-NEXT: s_add_co_i32 s0, s8, s0 ; GFX12-NEXT: s_add_co_i32 s1, s9, s1 ; GFX12-NEXT: s_add_co_i32 s2, s10, s2 -; GFX12-NEXT: s_add_co_i32 s7, s15, s7 +; GFX12-NEXT: s_add_co_i32 s3, s11, s3 ; GFX12-NEXT: s_add_co_i32 s4, s12, s4 ; GFX12-NEXT: s_add_co_i32 s5, s13, s5 ; GFX12-NEXT: s_add_co_i32 s6, s14, s6 +; GFX12-NEXT: s_add_co_i32 s7, s15, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -1945,49 +1938,49 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_readfirstlane_b32 s28, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readfirstlane_b32 s33, v14 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s4, s20, s4 ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 ; GFX7-NEXT: v_readfirstlane_b32 s27, v9 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: 
v_readfirstlane_b32 s28, v10 ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s33, v14 ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s20, s4 ; GFX7-NEXT: s_add_i32 s5, s21, s5 ; GFX7-NEXT: s_add_i32 s6, s22, s6 ; GFX7-NEXT: s_add_i32 s7, s23, s7 ; GFX7-NEXT: s_add_i32 s8, s24, s8 -; GFX7-NEXT: s_add_i32 s12, s28, s12 -; GFX7-NEXT: s_add_i32 s16, s33, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s25, s9 ; GFX7-NEXT: s_add_i32 s10, s26, s10 ; GFX7-NEXT: s_add_i32 s11, s27, s11 +; GFX7-NEXT: s_add_i32 s12, s28, s12 ; GFX7-NEXT: s_add_i32 s13, s29, s13 ; GFX7-NEXT: s_add_i32 s14, s30, s14 ; GFX7-NEXT: s_add_i32 s15, s31, s15 +; GFX7-NEXT: s_add_i32 s16, s33, s16 ; GFX7-NEXT: s_add_i32 s17, s34, s17 ; GFX7-NEXT: s_add_i32 s18, s35, s18 ; GFX7-NEXT: s_add_i32 s19, s36, s19 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-NEXT: v_mov_b32_e32 v10, s12 -; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 @@ -2007,45 +2000,45 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: 
v_readfirstlane_b32 s19, v5 ; GFX11-NEXT: v_readfirstlane_b32 s16, v2 ; GFX11-NEXT: v_readfirstlane_b32 s17, v3 ; GFX11-NEXT: v_readfirstlane_b32 s18, v4 +; GFX11-NEXT: v_readfirstlane_b32 s19, v5 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_readfirstlane_b32 s23, v9 ; GFX11-NEXT: v_readfirstlane_b32 s20, v6 ; GFX11-NEXT: v_readfirstlane_b32 s21, v7 ; GFX11-NEXT: v_readfirstlane_b32 s22, v8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v9 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_readfirstlane_b32 s27, v13 ; GFX11-NEXT: v_readfirstlane_b32 s24, v10 ; GFX11-NEXT: v_readfirstlane_b32 s25, v11 ; GFX11-NEXT: v_readfirstlane_b32 s26, v12 +; GFX11-NEXT: v_readfirstlane_b32 s27, v13 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s31, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v14 ; GFX11-NEXT: v_readfirstlane_b32 s29, v15 ; GFX11-NEXT: v_readfirstlane_b32 s30, v16 +; GFX11-NEXT: v_readfirstlane_b32 s31, v17 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s19, s3 ; GFX11-NEXT: s_add_i32 s0, s16, s0 ; GFX11-NEXT: s_add_i32 s1, s17, s1 ; GFX11-NEXT: s_add_i32 s2, s18, s2 -; GFX11-NEXT: s_add_i32 s7, s23, s7 +; GFX11-NEXT: s_add_i32 s3, s19, s3 ; GFX11-NEXT: s_add_i32 s4, s20, s4 ; GFX11-NEXT: s_add_i32 s5, s21, s5 ; GFX11-NEXT: s_add_i32 s6, s22, s6 -; GFX11-NEXT: s_add_i32 s11, s27, s11 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s7, s23, s7 ; GFX11-NEXT: s_add_i32 s8, s24, s8 ; GFX11-NEXT: s_add_i32 s9, s25, s9 ; GFX11-NEXT: s_add_i32 s10, s26, s10 -; GFX11-NEXT: s_add_i32 s15, s31, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s11, s27, s11 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s28, s12 ; GFX11-NEXT: s_add_i32 s13, s29, s13 ; GFX11-NEXT: s_add_i32 s14, s30, 
s14 +; GFX11-NEXT: s_add_i32 s15, s31, s15 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -2069,45 +2062,45 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x3 -; GFX12-NEXT: v_readfirstlane_b32 s19, v5 ; GFX12-NEXT: v_readfirstlane_b32 s16, v2 ; GFX12-NEXT: v_readfirstlane_b32 s17, v3 ; GFX12-NEXT: v_readfirstlane_b32 s18, v4 +; GFX12-NEXT: v_readfirstlane_b32 s19, v5 ; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s23, v9 ; GFX12-NEXT: v_readfirstlane_b32 s20, v6 ; GFX12-NEXT: v_readfirstlane_b32 s21, v7 ; GFX12-NEXT: v_readfirstlane_b32 s22, v8 +; GFX12-NEXT: v_readfirstlane_b32 s23, v9 ; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s27, v13 ; GFX12-NEXT: v_readfirstlane_b32 s24, v10 ; GFX12-NEXT: v_readfirstlane_b32 s25, v11 ; GFX12-NEXT: v_readfirstlane_b32 s26, v12 +; GFX12-NEXT: v_readfirstlane_b32 s27, v13 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s31, v17 ; GFX12-NEXT: v_readfirstlane_b32 s28, v14 ; GFX12-NEXT: v_readfirstlane_b32 s29, v15 ; GFX12-NEXT: v_readfirstlane_b32 s30, v16 +; GFX12-NEXT: v_readfirstlane_b32 s31, v17 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s19, s3 ; GFX12-NEXT: s_add_co_i32 s0, s16, s0 ; GFX12-NEXT: s_add_co_i32 s1, s17, s1 ; GFX12-NEXT: s_add_co_i32 s2, s18, s2 -; GFX12-NEXT: s_add_co_i32 s7, s23, s7 +; GFX12-NEXT: s_add_co_i32 s3, s19, s3 ; GFX12-NEXT: s_add_co_i32 s4, s20, s4 ; GFX12-NEXT: s_add_co_i32 s5, s21, s5 ; GFX12-NEXT: s_add_co_i32 s6, s22, s6 -; GFX12-NEXT: s_add_co_i32 s11, s27, s11 -; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s7, s23, s7 ; GFX12-NEXT: s_add_co_i32 s8, s24, s8 ; GFX12-NEXT: 
s_add_co_i32 s9, s25, s9 ; GFX12-NEXT: s_add_co_i32 s10, s26, s10 -; GFX12-NEXT: s_add_co_i32 s15, s31, s15 -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s11, s27, s11 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s28, s12 ; GFX12-NEXT: s_add_co_i32 s13, s29, s13 ; GFX12-NEXT: s_add_co_i32 s14, s30, s14 +; GFX12-NEXT: s_add_co_i32 s15, s31, s15 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index b75eb737534e9..499118d03ba27 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -98,8 +98,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -136,8 +137,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 4 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; 
GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297 @@ -313,12 +315,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_ashr_i32 s5, s4, 31 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -327,12 +329,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -911,11 +913,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_ashr_i32 s5, s4, 31 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -925,11 +927,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 
s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1508,8 +1510,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1644,9 +1646,9 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; ; GFX12-LABEL: mubuf_cmpxchg_sgpr_ptr_vgpr_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 1462b5965c0ab..f6645c3dc5dbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -3247,8 +3247,8 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: s_ashr_i32 s3, s2, 31 -; GFX8-NEXT: s_mulk_i32 s2, 0x50 ; GFX8-NEXT: s_mulk_i32 s3, 0x50 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_add_u32 s3, s3, s4 ; 
GFX8-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll index e0581f01dda6a..6526d4f76d8ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll @@ -596,13 +596,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_or_b32 s4, s3, 0x50 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -614,11 +614,11 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_or_b32 s2, s2, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -629,10 +629,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x50 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -643,10 +643,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: 
s_mov_b32 s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_or_b32 s2, s2, 0x50 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -657,9 +657,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, s2, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -670,10 +671,12 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_or_b32 s2, s2, 0x50 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index e450da73ab47d..6b29215a77006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -570,8 +570,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_ashr_i32 s1, s0, 31 ; NEW_RBS-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; NEW_RBS-NEXT: s_andn2_b32 s1, s5, exec_lo -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; 
NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, exec_lo ; NEW_RBS-NEXT: s_or_b32 s5, s1, s5 ; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 @@ -583,8 +583,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2 ; NEW_RBS-NEXT: ; %bb.4: ; %B ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_mov_b32 s6, exec_lo ; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6 ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo @@ -595,8 +595,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1 ; NEW_RBS-NEXT: ; %bb.5: ; %loop.body ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_add_i32 s2, s0, 1 ; NEW_RBS-NEXT: s_cmpk_lt_u32 s0, 0x64 ; NEW_RBS-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -604,8 +604,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo ; NEW_RBS-NEXT: s_andn2_b32 s3, s6, exec_lo ; NEW_RBS-NEXT: s_and_b32 s0, exec_lo, s0 -; NEW_RBS-NEXT: s_or_b32 s6, s3, s0 ; NEW_RBS-NEXT: global_load_dword v8, v[6:7], off +; NEW_RBS-NEXT: s_or_b32 s6, s3, s0 ; NEW_RBS-NEXT: s_mov_b32 s0, s2 ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_add_nc_u32_e32 v8, 1, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 2f956d7a0a534..6369bb557c14b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4180,8 +4180,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: 
s_add_u32 s4, s0, s2 ; GFX6-NEXT: s_addc_u32 s3, s1, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4205,8 +4205,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_add_u32 s4, s0, s2 ; GFX8-NEXT: s_addc_u32 s3, s1, s3 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4230,8 +4230,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4558,8 +4558,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4579,8 +4579,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-LABEL: s_saddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: 
v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4600,8 +4600,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-LABEL: s_saddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4921,23 +4921,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_saddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX6-NEXT: s_add_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -4959,23 +4959,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-LABEL: s_saddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; 
GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX8-NEXT: s_add_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -4997,23 +4997,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-LABEL: s_saddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX9-NEXT: s_add_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -5097,13 +5097,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i128: ; GFX6: ; 
%bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s8, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] @@ -5139,12 +5139,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s4 ; GFX8-NEXT: s_addc_u32 s5, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s8, s2, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5186,12 +5186,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s4 ; GFX9-NEXT: s_addc_u32 s5, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s8, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -5887,13 +5887,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_saddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, 
s1 ; GFX6-NEXT: s_addc_u32 s16, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] @@ -5917,16 +5917,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -5966,12 +5966,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s8 ; GFX8-NEXT: s_addc_u32 s9, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s16, s2, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5990,27 +5990,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 -; 
GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: s_add_u32 s0, s4, s12 +; GFX8-NEXT: s_addc_u32 s1, s5, s13 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -6056,12 +6056,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s8 ; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s16, s2, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -6080,27 +6080,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 
0x80000000 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_add_u32 s0, s4, s12 +; GFX9-NEXT: s_addc_u32 s1, s5, s13 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 9d6ffc9bbc0dc..afbc14c03b6b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -42,13 +42,13 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: 
s_endpgm @@ -642,6 +642,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX8-NEXT: s_add_i32 s0, s8, s12 ; GFX8-NEXT: s_xor_b32 s0, s0, s12 @@ -688,13 +689,12 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -878,6 +878,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX8-NEXT: s_sub_i32 s11, 0, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -982,15 +983,15 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: s_xor_b32 s0, s3, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 +; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 ; 
GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s3, v7 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] @@ -2234,13 +2235,13 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2417,12 +2418,12 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 @@ -2649,13 +2650,13 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, 
s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -3061,15 +3062,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3215,15 +3216,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 
v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 256d6d9a16fa9..ec77987a33527 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -257,8 +257,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: flat_store_dword v[2:3], v1 @@ -272,8 +272,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off @@ -287,8 +287,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX10-NEXT: global_store_dword v[2:3], v1, off @@ -347,9 +347,9 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 
0, v2, vcc ; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index c1b225562b77b..cc8c6f950ec8e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4185,8 +4185,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: s_sub_u32 s4, s0, s2 ; GFX6-NEXT: s_subb_u32 s3, s1, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4210,8 +4210,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: s_subb_u32 s3, s1, s3 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4235,8 +4235,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4563,8 +4563,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 
inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4584,8 +4584,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4605,8 +4605,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-LABEL: s_ssubsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4926,23 +4926,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_ssubsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 
v3, v3, v1, vcc -; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -4964,23 +4964,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-LABEL: s_ssubsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_subb_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -5002,23 +5002,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-LABEL: s_ssubsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_subb_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -5102,13 +5102,13 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] @@ -5146,12 +5146,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s10, s2, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5195,12 +5195,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: 
s_subb_u32 s10, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -5940,13 +5940,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s17, s1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] @@ -5972,16 +5972,16 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_sub_u32 s0, s4, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s1, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6023,12 +6023,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, 
s0, s8 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s18, s2, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -6049,27 +6049,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_sub_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 +; GFX8-NEXT: s_subb_u32 s1, s5, s13 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -6117,12 +6117,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s18, s2, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -6143,27 +6143,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_sub_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 +; GFX9-NEXT: s_subb_u32 s1, s5, s13 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll index e2fb704599250..b65e92eb9ec26 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll @@ -363,9 +363,9 @@ define 
void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_sub_u32 s4, s16, s18 ; GFX7-NEXT: s_subb_u32 s5, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -380,8 +380,8 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s4, s16, s18 ; GFX9-NEXT: s_subb_u32 s5, s17, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -394,8 +394,8 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s4, s16, s18 ; GFX8-NEXT: s_subb_u32 s5, s17, s19 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll index 017575b92143b..480ea7436c4a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -676,8 +676,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_ssubo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_u32 s4, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_subb_u32 s5, s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -693,8 +693,8 
@@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_ssubo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -710,8 +710,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_ssubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index c50b491bcb074..191b8dadea991 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s + define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: @@ -35,9 +36,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, 
s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -238,16 +239,16 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -536,6 +537,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -576,7 +578,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -706,6 +707,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 
v6, s18 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -784,7 +786,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s19, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -1243,16 +1244,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -1813,9 +1814,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: 
flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2128,9 +2129,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2440,11 +2441,11 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2561,11 +2562,11 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index 2d3ce9469ee90..dce7410b0bd88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -345,10 +345,10 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_add_u32 s0, s0, 2 -; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll index 6b749df71223f..a19ebfd56c66d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -20,15 +20,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: 
v_dual_mov_b32 v10, s0 @@ -62,15 +62,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 @@ -102,10 +102,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x42004200 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -122,10 +122,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; 
GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -142,10 +142,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -176,15 +176,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -218,15 +218,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 @@ -260,15 +260,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -302,15 +302,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; 
GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -344,15 +344,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -386,15 +386,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: 
v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -428,15 +428,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll index 5344ab8da1ade..47077e025a90b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -18,11 +18,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 ; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -52,11 +52,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> ; 
GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 ; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -85,11 +85,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -103,11 +102,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -121,11 +119,10 @@ define amdgpu_ps void 
@test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -152,11 +149,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -186,11 +183,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -220,11 +217,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -254,11 +251,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -288,11 +285,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -322,11 +319,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; 
GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -356,11 +353,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index e882769f97ac1..4947594e414e2 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -544,11 +544,11 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[4:5] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 @@ -614,11 +614,11 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll 
b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index b8962fa29e8f1..e7b959bb50550 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -337,7 +337,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -354,6 +353,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -464,7 +464,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -481,6 +480,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -493,12 +493,12 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; 
GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 @@ -2095,10 +2095,17 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: flat_load_dword v1, v[2:3] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 @@ -2127,13 +2134,6 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: flat_load_dword v1, v[2:3] -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2243,10 +2243,13 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: flat_load_dword v1, v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 @@ -2275,9 +2278,6 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: flat_load_dword v1, v[2:3] ; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2595,9 +2595,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi @@ -2841,9 +2841,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi @@ -3200,9 +3200,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi @@ -4010,7 +4010,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -4027,6 +4026,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -4135,7 +4135,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; 
GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -4152,6 +4151,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -4164,12 +4164,12 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 @@ -4382,9 +4382,9 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi @@ -4568,9 +4568,9 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi @@ -4837,9 +4837,9 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi @@ -6384,8 +6384,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi @@ -6568,9 +6568,9 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi @@ -6754,9 +6754,9 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi @@ -6975,9 +6975,9 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v7 ; GFX950-NEXT: v_and_b32_e32 v5, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi @@ -7197,9 +7197,9 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi @@ -7384,10 +7384,10 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi @@ -7575,10 +7575,10 @@ define void 
@flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi @@ -7766,10 +7766,10 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi @@ -7957,10 +7957,10 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi @@ -8351,9 +8351,9 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 
a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi @@ -10108,8 +10108,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen @@ -10167,8 +10168,8 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi @@ -10328,9 +10329,10 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10377,8 +10379,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr 
%ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi @@ -10516,9 +10518,10 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10565,8 +10568,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi @@ -10721,10 +10724,11 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -10988,10 +10992,11 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -14505,12 +14510,12 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB195_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14552,12 +14557,12 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 
v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB195_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14576,8 +14581,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end @@ -14682,12 +14687,12 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB197_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14729,12 +14734,12 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB197_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14754,9 +14759,9 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], 
off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end @@ -14863,12 +14868,12 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB199_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14910,12 +14915,12 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB199_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14934,9 +14939,9 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 ; 
GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end @@ -15042,12 +15047,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB201_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15106,12 +15111,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB201_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15147,9 +15152,9 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 ; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi 
@@ -15292,12 +15297,12 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB203_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15339,12 +15344,12 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB203_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15363,9 +15368,9 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX950-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end @@ -15471,12 +15476,12 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: 
s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB205_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15518,12 +15523,12 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB205_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15542,9 +15547,9 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end @@ -15650,12 +15655,12 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; 
GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB207_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15698,12 +15703,12 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB207_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15723,10 +15728,10 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end @@ -15835,12 +15840,12 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, 
a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB209_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15883,12 +15888,12 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB209_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15908,10 +15913,10 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end @@ -16020,12 +16025,12 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB211_2 ; GFX90A-NEXT: ; %bb.1: ; 
%atomicrmw.global @@ -16068,12 +16073,12 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB211_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16093,10 +16098,10 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end @@ -16205,12 +16210,12 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB213_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16253,12 +16258,12 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; 
GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB213_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16278,10 +16283,10 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end @@ -16390,12 +16395,12 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB215_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16440,12 +16445,12 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: 
s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB215_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16580,12 +16585,12 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB217_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16632,12 +16637,12 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB217_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16660,9 +16665,9 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 ; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end @@ -16777,12 +16782,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB219_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16843,12 +16848,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB219_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -17042,12 +17047,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB221_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -17106,12 +17111,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB221_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18145,12 +18150,12 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB235_3 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private @@ -18205,12 +18210,12 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; 
GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB235_3 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private @@ -18383,12 +18388,12 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB237_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18420,8 +18425,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -18440,12 +18446,12 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB237_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ 
-18476,8 +18482,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi @@ -18607,12 +18613,12 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB239_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18633,9 +18639,10 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -18654,12 +18661,12 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], 
-1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB239_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18680,8 +18687,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end @@ -18789,12 +18796,12 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB241_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18815,9 +18822,10 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; 
GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -18836,12 +18844,12 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB241_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18862,8 +18870,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end @@ -18971,12 +18979,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB243_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19013,10 +19021,11 @@ define void 
@flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -19036,12 +19045,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB243_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19230,12 +19239,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB245_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19272,10 +19281,11 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) 
#0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -19295,12 +19305,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB245_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index b6fe0c756a106..668244a279dee 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -337,7 +337,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -354,6 +353,7 @@ define void 
@global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -464,7 +464,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -481,6 +480,7 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -493,12 +493,12 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: 
buffer_wbl2 sc0 sc1 @@ -1600,10 +1600,17 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: global_load_dword v1, v[2:3], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 @@ -1632,13 +1639,6 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: global_load_dword v1, v[2:3], off -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1748,10 +1748,13 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: global_load_dword v1, v[2:3], 
off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 @@ -1780,9 +1783,6 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: global_load_dword v1, v[2:3], off ; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2991,7 +2991,6 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -3008,6 +3007,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -3116,7 +3116,6 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload 
Reuse @@ -3133,6 +3132,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -3145,12 +3145,12 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 @@ -10334,11 +10334,11 @@ define void @global_atomic_xchg_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10352,10 +10352,11 @@ define void @global_atomic_xchg_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10412,11 +10413,11 @@ define void @global_atomic_add_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_add_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10430,10 +10431,11 @@ define void @global_atomic_add_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_add_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10490,11 +10492,11 @@ define void @global_atomic_sub_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_sub_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v2, 
v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10508,10 +10510,11 @@ define void @global_atomic_sub_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_sub_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10568,11 +10571,11 @@ define void @global_atomic_and_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_and_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10586,10 +10589,11 @@ define void @global_atomic_and_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_and_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10788,11 +10792,11 @@ define void @global_atomic_or_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_or_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10806,10 +10810,11 @@ define void @global_atomic_or_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_or_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10866,11 +10871,11 @@ define void @global_atomic_xor_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10884,10 +10889,11 @@ define void @global_atomic_xor_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_xor_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 
v1, a1 ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10944,11 +10950,11 @@ define void @global_atomic_max_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_max_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10962,10 +10968,11 @@ define void @global_atomic_max_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_max_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11022,11 +11029,11 @@ define void @global_atomic_min_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_min_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11040,10 +11047,11 @@ define void @global_atomic_min_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_min_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11100,11 +11108,11 @@ define void @global_atomic_umax_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_umax_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11118,10 +11126,11 @@ define void @global_atomic_umax_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_umax_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11178,11 +11187,11 @@ define void @global_atomic_umin_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_umin_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11196,10 +11205,11 @@ define void @global_atomic_umin_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_umin_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11256,11 +11266,11 @@ define void @global_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11274,10 +11284,11 @@ define void @global_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11334,11 +11345,11 @@ define void @global_atomic_udec_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-LABEL: 
global_atomic_udec_wrap_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11352,10 +11363,11 @@ define void @global_atomic_udec_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-LABEL: global_atomic_udec_wrap_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12449,11 +12461,11 @@ define void @global_atomic_fadd_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12467,10 +12479,11 @@ define void @global_atomic_fadd_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12657,11 +12670,11 @@ define void @global_atomic_fmax_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12675,10 +12688,11 @@ define void @global_atomic_fmax_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12735,11 +12749,11 @@ define void @global_atomic_fmin_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12753,10 +12767,11 @@ define void 
@global_atomic_fmin_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 3194581fa4213..c09bdcf97e0ab 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -245,11 +245,11 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 ; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 -; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 +; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index d22a4b978980f..a8b82d8fd16e5 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -88,22 +88,22 @@ body: | ; GFX908-LABEL: name: a2_to_v2 ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; 
GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; ; GFX90A-LABEL: name: a2_to_v2 ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; ; GFX942-LABEL: name: a2_to_v2 ; GFX942: liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1 @@ -119,25 +119,25 @@ body: | ; GFX908-LABEL: name: a3_to_v3 ; GFX908: liveins: $agpr0_agpr1_agpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, 
implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; ; GFX90A-LABEL: name: a3_to_v3 ; GFX90A: liveins: $agpr0_agpr1_agpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; ; GFX942-LABEL: name: a3_to_v3 ; GFX942: liveins: $agpr0_agpr1_agpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 @@ -152,28 +152,28 @@ body: | ; GFX908-LABEL: 
name: a4_to_v4 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; ; GFX90A-LABEL: name: a4_to_v4 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; ; GFX942-LABEL: name: a4_to_v4 ; GFX942: liveins: 
$agpr0_agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 @@ -189,40 +189,40 @@ body: | ; GFX908-LABEL: name: a8_to_v8 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, 
implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; ; GFX90A-LABEL: name: a8_to_v8 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; ; GFX942-LABEL: name: a8_to_v8 ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -237,64 +237,64 @@ body: | ; GFX908-LABEL: name: a16_to_v16 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr2 = 
V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX908-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX908-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX908-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX908-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec + ; GFX908-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX908-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX908-NEXT: 
S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; ; GFX90A-LABEL: name: a16_to_v16 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; 
GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec + ; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; ; GFX942-LABEL: name: a16_to_v16 ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr15 = 
V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX942-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX942-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec + ; GFX942-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX942-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX942-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX942-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -336,22 +336,22 @@ 
body: | ; GFX908-LABEL: name: v2_to_a2 ; GFX908: liveins: $vgpr0_vgpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX90A-LABEL: name: v2_to_a2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX942-LABEL: name: v2_to_a2 ; GFX942: liveins: $vgpr0_vgpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 @@ -366,25 +366,25 @@ body: | ; GFX908-LABEL: name: v3_to_a3 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit 
$vgpr0_vgpr1_vgpr2 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: v3_to_a3 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: v3_to_a3 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed 
$vgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -399,28 +399,28 @@ body: | ; GFX908-LABEL: name: v4_to_a4 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: v4_to_a4 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: v4_to_a4 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 @@ -435,40 +435,40 @@ body: | ; GFX908-LABEL: name: v8_to_a8 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, 
implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: v8_to_a8 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: v8_to_a8 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -483,64 +483,64 @@ body: | ; GFX908-LABEL: name: v16_to_a16 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr11 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX908-NEXT: 
$agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: v16_to_a16 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: 
$agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, 
implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: v16_to_a16 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; 
GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -583,24 +583,24 @@ body: | ; GFX908-LABEL: name: s2_to_a2 ; GFX908: liveins: $sgpr0_sgpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX90A-LABEL: name: s2_to_a2 ; GFX90A: liveins: $sgpr0_sgpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX942-LABEL: name: s2_to_a2 ; GFX942: liveins: $sgpr0_sgpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed 
$sgpr0_sgpr1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 @@ -615,28 +615,28 @@ body: | ; GFX908-LABEL: name: s3_to_a3 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: s3_to_a3 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, 
implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: s3_to_a3 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -651,32 +651,32 @@ body: | ; GFX908-LABEL: name: s4_to_a4 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: s4_to_a4 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: s4_to_a4 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: 
$agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 @@ -691,40 +691,40 @@ body: | ; GFX908-LABEL: name: s6_to_a6 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit 
$exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-LABEL: name: s6_to_a6 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX942-LABEL: name: s6_to_a6 ; GFX942: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 @@ -739,48 +739,48 @@ body: | ; GFX908-LABEL: name: s8_to_a8 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr6, implicit $exec ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = 
V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: s8_to_a8 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = 
V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: s8_to_a8 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX942-NEXT: 
$agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -795,80 +795,80 @@ body: | ; GFX908-LABEL: name: s16_to_a16 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr6, implicit $exec ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr9, implicit $exec ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr10, implicit $exec ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr11, implicit $exec ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr12, implicit $exec ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr13, implicit $exec ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr14, implicit $exec ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: s16_to_a16 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = 
V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr9, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr10, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr11, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr12, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr13, implicit $exec + ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr14, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: s16_to_a16 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit 
$exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; 
GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr9, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr10, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr11, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr12, implicit $exec + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr13, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr14, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -908,9 +908,9 @@ body: | ; GFX908-LABEL: name: a2_to_a2 ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -918,16 +918,16 @@ body: | ; GFX90A-LABEL: name: a2_to_a2 ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; ; GFX942-LABEL: name: a2_to_a2 ; GFX942: 
liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY $agpr0_agpr1, implicit $exec @@ -944,9 +944,9 @@ body: | ; GFX908-LABEL: name: a2_to_a2_kill ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -954,16 +954,16 @@ body: | ; GFX90A-LABEL: name: a2_to_a2_kill ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: $agpr3 = 
V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; ; GFX942-LABEL: name: a2_to_a2_kill ; GFX942: liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec @@ -984,9 +984,9 @@ body: | ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr3_agpr4 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr1_agpr2 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: a2_to_a2_implicit_defs @@ -996,8 +996,8 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, 
implicit $exec, implicit $agpr0_agpr1 ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: a2_to_a2_implicit_defs ; GFX942: liveins: $agpr0_agpr1 @@ -1006,8 +1006,8 @@ body: | ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 @@ -1024,28 +1024,28 @@ body: | ; GFX908-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX908: liveins: $agpr4_agpr5_agpr6 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX90A: liveins: $agpr4_agpr5_agpr6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX942: liveins: $agpr4_agpr5_agpr6 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 killed 
$agpr5, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -1060,11 +1060,11 @@ body: | ; GFX908-LABEL: name: a3_to_a3_overlap_kill ; GFX908: liveins: $agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 @@ -1072,18 +1072,18 @@ body: | ; GFX90A-LABEL: name: a3_to_a3_overlap_kill ; GFX90A: liveins: $agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; 
GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 ; ; GFX942-LABEL: name: a3_to_a3_overlap_kill ; GFX942: liveins: $agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3 @@ -1098,30 +1098,30 @@ body: | bb.0: ; GFX908-LABEL: name: a4_to_a4 ; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-LABEL: name: a4_to_a4 ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; ; GFX942-LABEL: name: a4_to_a4 ; GFX942: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = 
V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -1137,32 +1137,32 @@ body: | ; GFX908-LABEL: name: a4_to_a4_overlap ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; ; GFX90A-LABEL: name: a4_to_a4_overlap ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; 
GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; ; GFX942-LABEL: name: a4_to_a4_overlap ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 @@ -1175,46 +1175,46 @@ body: | bb.0: ; GFX908-LABEL: name: a8_to_a8 ; GFX908: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = 
V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: a8_to_a8 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $agpr14 = 
V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: a8_to_a8 ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX942-NEXT: $agpr13 = 
V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec @@ -1229,78 +1229,78 @@ body: | ; GFX908-LABEL: name: a16_to_a16 ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec + ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec ; GFX908-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec ; GFX908-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec ; GFX908-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec ; GFX908-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec ; GFX908-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec ; GFX908-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec ; GFX908-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; 
GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec ; GFX908-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec ; GFX908-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec ; GFX908-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec ; GFX908-NEXT: 
$agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX90A-LABEL: name: a16_to_a16 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 
$agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 
$agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 killed $agpr15, implicit $exec + ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 killed $agpr14, implicit $exec + ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 killed $agpr13, implicit $exec + ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 killed $agpr12, implicit $exec + ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 killed $agpr11, implicit $exec + ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 killed $agpr10, implicit $exec + ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 killed $agpr9, implicit $exec + ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 killed $agpr8, implicit $exec + ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX942-LABEL: name: a16_to_a16 ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 killed $agpr15, implicit $exec + ; GFX942-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 killed $agpr14, implicit $exec + ; GFX942-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 killed $agpr13, implicit $exec + ; GFX942-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 killed $agpr12, implicit $exec + ; GFX942-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 killed $agpr11, implicit $exec + ; GFX942-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 killed $agpr10, implicit $exec + ; GFX942-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 killed $agpr9, implicit $exec + ; GFX942-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 killed $agpr8, implicit $exec + ; GFX942-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX942-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX942-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX942-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX942-NEXT: 
$agpr16 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec @@ -1353,37 +1353,29 @@ body: | ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; 
GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable $sgpr0_sgpr1_sgpr2_sgpr3, lanemask(240), implicit $exec S_ENDPGM 0, 
implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ... --- @@ -1397,37 +1389,29 @@ body: | ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, 
implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
@@ -1442,37 +1426,29 @@ body: | ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit 
$exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple ; GFX942: liveins: $agpr0, $agpr2_agpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable $agpr0_agpr1_agpr2_agpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ... 
@@ -1487,37 +1463,29 @@ body: | ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, 
implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX942: liveins: $agpr0, $agpr2_agpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable killed $agpr0_agpr1_agpr2_agpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index b8814b64735e6..2701fce7f90b8 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -344,9 +344,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -541,20 +541,20 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -1021,8 +1021,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; 
@@ -1033,8 +1034,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1123,8 +1125,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1137,8 +1140,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1166,10 +1170,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: .LBB9_2: ; %if ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: .LBB9_3: ; %endif -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -1192,10 +1196,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: .LBB9_2: ; %if ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: .LBB9_3: ; %endif -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -1218,9 +1222,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: .LBB9_2: ; %if ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX9-NEXT: .LBB9_3: ; %endif +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -1241,9 +1245,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: .LBB9_2: ; %if ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX10-NEXT: .LBB9_3: ; %endif +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm @@ -1265,8 +1269,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: .LBB9_3: ; %endif ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB9_4: @@ 
-1286,8 +1290,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-NEXT: .LBB9_3: ; %endif ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; GFX12-NEXT: .LBB9_4: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ef7a13819a799..9a3a04a622086 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -568,19 +568,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 ; GFX908-NEXT: s_mov_b32 s13, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s13 +; GFX908-NEXT: v_mov_b32_e32 v7, s13 +; GFX908-NEXT: v_mov_b32_e32 v9, s13 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v4, s12 +; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s12 ; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s9, v2 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir 
b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir index 950382758ffbc..945a8faf8ebae 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir @@ -34,17 +34,17 @@ body: | ; GFX908-LABEL: name: no_free_vgprs_for_copy_a64_to_a64 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3 - ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_a64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
implicit $agpr2_agpr3 $agpr2_agpr3 = COPY $agpr0_agpr1 S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir index 1573903945a3e..603179b7063f7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir @@ -15,10 +15,10 @@ body: | ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec @@ -47,8 +47,8 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 
$vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 $agpr0_agpr1 = IMPLICIT_DEF SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -66,13 +66,13 @@ body: | ; GFX908-LABEL: name: overlapping_agpr ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr1_agpr2_agpr3_agpr4 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, 
implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1_agpr2_agpr3_agpr4 $agpr1_agpr2_agpr3_agpr4 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir index a9d31c1c45b0e..da8e368f5ac47 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir @@ -35,17 +35,17 @@ body: | ; GFX908-LABEL: name: no_free_vgprs_for_copy_s64_to_a64 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3 - ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9 + ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_s64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit-def $agpr2_agpr3, implicit $sgpr8_sgpr9 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9 + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 $agpr2_agpr3 = COPY $sgpr8_sgpr9 S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 63b7b70548baf..3c9b08d26288d 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -124,8 +124,8 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -167,14 +167,14 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_empty@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_empty@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -259,8 +259,8 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -302,14 +302,14 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_4@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_4@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -394,8 +394,8 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -437,14 +437,14 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_32@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_32@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; 
GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -529,8 +529,8 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -572,14 +572,14 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_64@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_64@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -664,8 +664,8 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -707,14 +707,14 @@ define amdgpu_kernel void 
@test_call_areg31_63() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_31_63@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_31_63@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -799,8 +799,8 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -842,14 +842,14 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_unknown@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_unknown@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART diff --git 
a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 689b306518c9b..1b7604c887b25 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 4c5c56a49fdc6..976522f7d5ea4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -161095,7 +161095,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_mov_b32_e32 v58, v31 @@ -161129,6 +161128,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v31, vcc ; GFX9-NEXT: v_bfe_u32 v31, v1, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v31, v31, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -161443,15 +161443,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: 
buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_mov_b32_e32 v34, v62 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v52 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v50 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; GFX9-NEXT: v_mov_b32_e32 v35, v63 @@ -161462,8 +161460,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v63, v16 +; GFX9-NEXT: v_mov_b32_e32 v34, v62 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -161471,6 +161470,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 8, v35 @@ -185476,6 +185476,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, v55 ; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v10, v32 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 @@ -185554,10 +185555,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v10, v32 ; VI-NEXT: v_add_f16_e32 v43, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_or_b32_e32 v51, v3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_or_b32_e32 v50, v2, v0 @@ -185672,21 +185672,19 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b64 v[10:11], 24, v[40:41] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -185734,6 +185732,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -185745,6 +185744,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] +; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v45 ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v51 @@ -218732,8 +218732,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB100_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v47, v15 :: v_dual_mov_b32 v46, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v48 :: v_dual_mov_b32 v17, v49 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v45, v13 :: v_dual_mov_b32 v44, v12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v43, v11 :: v_dual_mov_b32 v42, v10 ; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8 @@ -218741,12 +218739,14 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v48 :: v_dual_mov_b32 v17, v49 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v50 :: v_dual_mov_b32 v19, v51 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v52 :: v_dual_mov_b32 v21, v53 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v54 :: v_dual_mov_b32 v23, v55 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v56 :: v_dual_mov_b32 v25, v57 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v58 :: v_dual_mov_b32 v27, v59 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48 @@ -230914,6 +230914,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_mov_b32_e32 v39, v59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -231202,7 +231203,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 -; SI-NEXT: v_mov_b32_e32 v39, v59 ; SI-NEXT: v_mov_b32_e32 v40, v60 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 ; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index da908bc280e6e..05e3580e06a49 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -11639,7 +11639,6 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll index 2b48cf0f41c88..5537d5705d4d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -55,10 +55,10 @@ define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index df77e7de43bf6..c8d344ff4a579 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7905,8 +7905,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, 
s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9119,8 +9119,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 2889f37a65d97..e8fee9204c883 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -54,9 +54,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -86,9 +86,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: 
s_swappc_b64 s[30:31], s[4:5] @@ -219,9 +219,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v29, v37 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v30, v38 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v31, v39 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GISEL-GFX10-NEXT: s_endpgm @@ -347,9 +347,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v29, v34 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v30, v33 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v31, v32 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25] ; DAGISEL-GFX10-NEXT: s_endpgm @@ -374,9 +374,9 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX10: ; %bb.0: ; %.entry ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -400,9 +400,9 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX10: ; %bb.0: ; %.entry ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; 
DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -449,10 +449,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -489,10 +489,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -530,10 +530,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 ; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; GISEL-GFX10-NEXT: s_mov_b32 
exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] ; @@ -564,10 +564,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 ; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) @@ -930,15 +930,15 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 ; GISEL-GFX11-NEXT: s_mov_b32 s2, 3 -; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 -; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX11-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 36e2db0c4879d..a4cab10e972bf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -60,10 +60,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -100,10 +100,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -583,15 +583,15 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 ; 
GISEL-GFX11-NEXT: s_mov_b32 s2, 3 -; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 -; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index fe9ec8e6ef52a..25124c88d490c 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -651,8 +651,8 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX6-NEXT: s_mov_b32 s0, 0x80000 ; GFX6-NEXT: s_movk_i32 s1, 0x80 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -674,8 +674,8 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX8-NEXT: s_mov_b32 s0, 0x80000 ; GFX8-NEXT: s_movk_i32 s1, 0x80 ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -738,10 +738,10 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX6-NEXT: s_and_b32 s8, s7, 62 ; GFX6-NEXT: s_add_u32 s6, s6, s4 ; GFX6-NEXT: s_addc_u32 s7, 0, s5 -; 
GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_add_u32 s4, s8, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -765,10 +765,10 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: s_and_b32 s8, s7, 62 ; GFX8-NEXT: s_add_u32 s6, s6, s4 ; GFX8-NEXT: s_addc_u32 s7, 0, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s4, s8, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_addc_u32 s5, 0, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 61645200690f5..f2aed101f045b 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -65,82 +65,82 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX8-NEXT: s_add_u32 s4, s10, 13 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 15 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 14 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 8 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 11 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 10 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 4 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 6 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 1 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_add_u32 s4, s10, 3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: s_add_u32 s4, s10, 3 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 9 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; 
GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s4, s10, 2 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_add_u32 s4, s10, 2 +; GFX8-NEXT: s_add_u32 s0, s10, 5 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_add_u32 s0, s10, 5 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s10, 12 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: s_add_u32 s0, s10, 12 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s10, 7 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_add_u32 s0, s10, 7 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 12cb8d2f6fb51..f04aa036fffbf 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < 
%s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s + declare i32 @llvm.amdgcn.workitem.id.x() ; Show what the atomic optimization pass will do for global pointers. @@ -2369,12 +2370,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) @@ -2421,12 +2422,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) @@ -2841,12 +2842,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: 
s_mov_b32 s8, s2 ; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc ; GFX8_DPP-NEXT: s_waitcnt vmcnt(0) @@ -2924,12 +2925,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: s_mov_b32 s8, s2 ; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) @@ -3717,8 +3718,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -3765,8 +3766,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_subrev_nc_u32_e32 v3, s10, v4 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -3816,9 +3817,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -3870,7 +3871,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132-NEXT: v_subrev_nc_u32_e32 v3, s10, v4 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -4148,8 +4149,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_subrev_nc_u32_e32 v3, s13, v4 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -4198,8 +4199,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_subrev_nc_u32_e32 v3, s11, v4 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -4251,9 +4252,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) 
%out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164-NEXT: v_subrev_nc_u32_e32 v3, s13, v4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -4307,7 +4308,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132-NEXT: v_subrev_nc_u32_e32 v3, s11, v4 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -4630,8 +4631,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv @@ -4688,8 +4689,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s8, v4 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv @@ -4752,9 +4753,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164_ITERATIVE-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv @@ -4818,7 +4819,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s8, v4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132_ITERATIVE-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv @@ -5373,8 +5374,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1132_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v5 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v6 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc ; 
GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv @@ -5617,8 +5617,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s12, v7 ; GFX8-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, v5 @@ -5674,8 +5674,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s12, v7 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v5 @@ -5731,14 +5731,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_mov_b32 s5, s3 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_sub_co_u32 v5, vcc, v7, s12 ; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v6, vcc, 0, v8, vcc -; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v2, v7 -; GFX1064-NEXT: v_mov_b32_e32 v1, v6 +; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v3, v8 +; GFX1064-NEXT: v_mov_b32_e32 v1, v6 ; GFX1064-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -5787,14 +5787,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_mov_b32 s5, s3 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; 
GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s10 ; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, 0, v8, vcc_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v2, v7 -; GFX1032-NEXT: v_mov_b32_e32 v1, v6 +; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-NEXT: v_mov_b32_e32 v1, v6 ; GFX1032-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -5845,17 +5845,17 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_mov_b32 s5, s3 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v7, v0 +; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_sub_co_u32 v5, vcc, v7, s12 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc -; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v2, v7 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v1, v6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-NEXT: v_mov_b32_e32 v1, v6 ; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -5910,13 +5910,11 @@ define amdgpu_kernel void 
@sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX1132-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v2, v7 ; GFX1132-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s10 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc_lo -; GFX1132-NEXT: v_mov_b32_e32 v0, v5 -; GFX1132-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_mov_b32_e32 v1, v6 +; GFX1132-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX1132-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -6124,8 +6122,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v9, v4 ; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v10, v5, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, v7 @@ -6175,10 +6173,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s7, s9, s6 ; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 ; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_mul_i32 s14, s8, s6 ; GFX9-NEXT: s_mov_b64 s[12:13], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -6187,8 +6185,8 @@ 
define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s14, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v6 @@ -6249,14 +6247,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_mov_b32 s5, s3 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_sub_co_u32 v5, vcc, v7, s14 ; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v6, vcc, s15, v8, vcc -; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v2, v7 -; GFX1064-NEXT: v_mov_b32_e32 v1, v6 +; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v3, v8 +; GFX1064-NEXT: v_mov_b32_e32 v1, v6 ; GFX1064-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -6310,14 +6308,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_mov_b32 s5, s3 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s12 ; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s13, v8, vcc_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v2, v7 -; GFX1032-NEXT: v_mov_b32_e32 v1, v6 +; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-NEXT: v_mov_b32_e32 v1, v6 ; GFX1032-NEXT: buffer_atomic_cmpswap_x2 v[0:3], 
off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -6373,17 +6371,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_mov_b32 s5, s3 ; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v7, v0 +; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_sub_co_u32 v5, vcc, v7, s14 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, s15, v8, vcc -; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v2, v7 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v1, v6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-NEXT: v_mov_b32_e32 v1, v6 ; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -6444,13 +6442,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX1132-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v2, v7 ; GFX1132-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s12 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1132-NEXT: v_subrev_co_ci_u32_e64 v6, null, s13, v8, vcc_lo -; GFX1132-NEXT: v_mov_b32_e32 v0, v5 -; GFX1132-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_mov_b32_e32 v1, v6 +; GFX1132-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX1132-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -6687,8 +6683,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX8_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v9, v0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX8_ITERATIVE-NEXT: v_subrev_u32_e32 v7, vcc, s8, v9 ; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v8, vcc, v10, v6, vcc ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, v7 @@ -6759,8 +6755,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX9_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v9, v0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX9_ITERATIVE-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v9 ; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v8, vcc, v10, v6, vcc ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, v7 @@ -6829,14 +6825,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_ITERATIVE-NEXT: 
v_sub_co_u32 v6, vcc, v8, s8 ; GFX1064_ITERATIVE-NEXT: v_subrev_co_ci_u32_e32 v7, vcc, s9, v9, vcc -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv @@ -6898,14 +6894,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 ; GFX1032_ITERATIVE-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v9, vcc_lo -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv @@ -6973,17 +6969,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1164_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc, v8, s8 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1164_ITERATIVE-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv @@ -7053,13 +7049,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v2, v8 ; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX1132_ITERATIVE-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; GFX1132_ITERATIVE-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv @@ -7301,8 +7295,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s11 ; GFX8_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX8_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX8_DPP-NEXT: v_subrev_u32_e32 v8, vcc, s10, v10 ; GFX8_DPP-NEXT: v_subb_u32_e32 v9, vcc, v11, v0, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7404,8 +7398,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s11 ; GFX9_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX9_DPP-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v10 ; GFX9_DPP-NEXT: v_subb_co_u32_e32 v9, vcc, v11, v0, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7526,13 +7520,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 ; GFX1064_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064_DPP-NEXT: v_sub_co_u32 v10, vcc, v12, s8 ; GFX1064_DPP-NEXT: v_subrev_co_ci_u32_e32 v11, vcc, s9, v13, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v12 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11 ; 
GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v13 ; GFX1064_DPP-NEXT: buffer_atomic_cmpswap_x2 v[8:11], off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7629,13 +7623,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_mov_b32 s5, s3 ; GFX1032_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v14, v10 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v14, v10 ; GFX1032_DPP-NEXT: v_sub_co_u32 v11, vcc_lo, v13, s8 ; GFX1032_DPP-NEXT: v_subrev_co_ci_u32_e32 v12, vcc_lo, s9, v14, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v13 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v14 ; GFX1032_DPP-NEXT: buffer_atomic_cmpswap_x2 v[9:12], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7756,17 +7750,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 ; GFX1164_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, v10, s8 ; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0) -; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: 
v_mov_b32_e32 v6, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v10 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v11 ; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b64 v[6:9], off, s[4:7], 0 glc ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7875,7 +7869,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_subrev_co_ci_u32_e64 v11, null, s9, v13, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v10 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v9, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v13 ; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b64 v[8:11], off, s[4:7], 0 glc ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) @@ -9042,9 +9036,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -9099,9 +9093,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt 
vmcnt(0) @@ -9157,13 +9151,13 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9222,13 +9216,13 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], 
off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9287,11 +9281,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9349,11 +9343,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9412,12 +9406,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) 
%result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -9478,12 +9472,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 
null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -9546,11 +9540,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9612,11 +9606,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, 
v1 @@ -10943,9 +10937,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -11000,9 +10994,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -11058,13 +11052,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 
v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11122,13 +11116,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11187,11 +11181,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; 
GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11248,11 +11242,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11311,12 +11305,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11376,12 +11370,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -11444,11 +11438,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11509,11 +11503,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12065,10 +12059,10 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -12106,10 +12100,10 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -12147,15 +12141,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12194,15 +12188,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12241,15 +12235,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; 
GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12287,15 +12282,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12333,15 +12329,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1264-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -12380,15 +12376,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -12427,15 +12423,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: .LBB18_1: ; 
%atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12473,15 +12470,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; 
GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12674,9 +12672,9 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX1064-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1064-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -12721,9 +12719,9 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX1032-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -12774,12 +12772,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12831,12 +12829,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12889,12 +12887,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1132-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12943,13 +12942,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1132-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13000,12 +12999,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; 
GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -13056,12 +13055,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: 
s_wait_loadcnt 0x0 @@ -13113,12 +13112,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13167,13 +13167,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1232-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; 
GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13720,8 +13720,8 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1064-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1064-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13769,8 +13769,8 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: v_cmp_u_f32_e64 s0, v0, v0 ; GFX1032-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13885,9 +13885,9 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: 
buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -14001,9 +14001,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -14115,9 +14116,9 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -14228,9 +14229,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; 
GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 785aee07a990e..a013c7e7ab7b7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2019,8 +2019,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] @@ -2067,8 +2067,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; 
GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2113,8 +2113,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2159,8 +2159,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2211,8 +2211,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2877,8 +2877,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 
v1, s1 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] @@ -2909,8 +2909,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2940,8 +2940,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2971,8 +2971,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -3007,8 +3007,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1164_ITERATIVE-NEXT: 
ds_add_u64 v2, v[0:1] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -3041,8 +3041,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1132_ITERATIVE-NEXT: ; %bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv @@ -5433,8 +5433,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] @@ -5481,8 +5481,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5527,8 +5527,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 
v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5573,8 +5573,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5625,8 +5625,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6907,8 +6907,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] @@ -6953,8 +6953,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6997,8 +6997,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7042,8 +7042,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7092,8 +7092,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8255,8 +8255,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; 
GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] @@ -8301,8 +8301,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8345,8 +8345,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8390,8 +8390,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8440,8 +8440,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: 
s_cbranch_execz .LBB18_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9603,8 +9603,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] @@ -9649,8 +9649,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9693,8 +9693,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9738,8 +9738,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: 
s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9788,8 +9788,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11202,8 +11202,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] @@ -11257,8 +11257,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11308,8 +11308,8 @@ define amdgpu_kernel void 
@max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11358,8 +11358,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11415,8 +11415,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13027,8 +13027,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; 
GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] @@ -13082,8 +13082,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13133,8 +13133,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13183,8 +13183,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13240,8 +13240,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14846,8 +14846,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] @@ -14900,8 +14900,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14950,8 +14950,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14999,8 +14999,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -15055,8 +15055,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16657,8 +16657,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] @@ -16711,8 +16711,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16761,8 +16761,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; 
GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16810,8 +16810,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16866,8 +16866,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll index ce8ffab77ac85..8b1dc1e4bc193 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll @@ -116,8 +116,8 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr ; GFX9-SDAG-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-SDAG-NEXT: s_addc_u32 s9, s1, 0 ; GFX9-SDAG-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-SDAG-NEXT: 
v_mov_b32_e32 v3, s9 ; GFX9-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -172,8 +172,8 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-GISEL-NEXT: s_addc_u32 s9, s1, 0 ; GFX9-GISEL-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 4af2d58b01518..043b6ffbc4018 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -20,8 +20,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -44,8 +44,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -75,8 +75,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-NEXT: 
s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 4bc6220b4d9a0..3ad770a95ad2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -108,14 +108,14 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 ; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] ; CHECK-NEXT: s_branch .LBB0_6 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index e33b9ab0eda9e..c4216f5cb4d84 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -54,8 +54,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; 
GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm @@ -150,11 +150,11 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_load_ushort v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm ; @@ -233,8 +233,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm @@ -316,10 +316,10 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dword v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm ; @@ -394,8 +394,8 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm @@ -558,8 
+558,8 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm @@ -725,10 +725,10 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GISEL-NEXT: s_endpgm @@ -821,9 +821,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index ad0d6d8016ad6..782a8507a9472 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -29,11 +29,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_mov_b32 s18, 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_mov_b32 s13, s12 ; CHECK-NEXT: s_mov_b32 s14, s12 ; CHECK-NEXT: 
s_mov_b32 s15, s12 -; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] ; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] +; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i ; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0 @@ -80,8 +80,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 -; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v1, s49 ; CHECK-NEXT: v_mov_b32_e32 v2, s50 ; CHECK-NEXT: v_mov_b32_e32 v3, s51 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index efb75e95212b2..9d828ad997f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -66,8 +67,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -569,8 +570,8 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -625,8 +626,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -680,8 +681,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -737,8 +738,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: v_mov_b32_e32 v7, v8 ; GFX6-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -824,8 +825,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1208,7 +1209,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1236,8 +1237,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1426,7 +1427,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1454,8 +1455,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1644,7 +1645,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1672,8 +1673,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1843,10 +1844,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV @@ -1883,10 +1884,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1914,11 +1915,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1955,10 +1956,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1985,10 +1986,10 @@ 
define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -2015,9 +2016,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2047,9 +2048,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2336,11 +2337,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 @@ -2434,11 +2435,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -2499,12 +2500,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v9 @@ -2595,10 +2596,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; 
GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 ; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -2657,10 +2658,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 ; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -2836,10 +2837,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2876,10 +2877,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2907,11 +2908,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -2966,10 +2967,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2996,10 +2997,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 
+; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3026,9 +3027,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3058,9 +3059,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3097,10 +3098,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3137,10 +3138,10 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -3168,11 +3169,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -3209,10 +3210,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3239,10 +3240,10 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3269,9 +3270,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3301,9 +3302,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3354,13 +3355,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -3401,7 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3409,7 +3412,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3479,12 +3482,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -3519,14 +3523,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3558,11 +3563,11 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3700,11 +3705,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3741,11 +3746,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3992,11 +3997,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4130,11 +4135,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4169,11 +4174,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4236,12 +4241,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4317,14 +4323,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4463,12 +4470,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v7 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4538,14 +4546,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4609,12 +4618,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4747,8 +4756,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4814,8 +4823,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 ; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5033,14 +5042,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -5092,14 +5101,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -5189,13 +5198,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; 
GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -5241,13 +5250,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -5287,9 +5296,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5612,14 +5621,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -5758,13 +5767,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, 
v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -5803,9 +5812,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6077,14 +6086,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: 
v_readfirstlane_b32 s4, v0 @@ -6177,8 +6186,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6335,14 +6343,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6430,8 +6438,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6654,8 +6661,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6728,8 +6735,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6797,8 +6804,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6867,8 +6874,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: 
v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6948,7 +6955,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6976,8 +6983,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7080,9 +7087,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7127,9 +7134,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; 
GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7468,8 +7475,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7613,8 +7619,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7671,8 +7677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_add_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v7, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7743,9 +7749,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 ; GFX7-NEXT: 
v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7902,7 +7908,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -7930,8 +7936,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -8050,9 +8056,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8097,9 +8103,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) 
; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8414,7 +8420,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -8442,8 +8448,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -8562,9 +8568,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8609,9 +8615,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8983,10 +8989,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -9037,7 +9044,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -9082,8 +9089,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: 
v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -9452,9 +9459,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -9497,8 +9505,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9892,8 +9900,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -9976,8 +9983,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10203,8 +10209,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10278,8 +10284,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10343,13 +10349,13 @@ define <2 x 
bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10422,8 +10428,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10553,10 +10559,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 
0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -10607,7 +10614,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -10652,8 +10659,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -11022,9 +11029,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11067,8 +11075,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11394,10 +11402,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -11448,7 +11457,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11493,8 +11502,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -11863,9 +11872,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11908,8 +11918,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12277,9 +12287,10 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -12322,8 +12333,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12612,8 +12623,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 
f3bf8c664f7a6..bbb7779a36ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -593,8 +594,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 ; GFX908-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -651,8 +652,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; GFX8-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -798,9 +799,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_max_f32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 
v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -829,9 +830,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1186,10 +1187,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1229,10 +1228,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], 
v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1367,10 +1364,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1408,10 +1405,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1459,9 +1456,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], 
v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1489,9 +1486,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1567,12 +1564,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[5:6] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 @@ -1667,12 +1664,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 
v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -1794,11 +1791,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -1858,11 +1855,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -1970,10 +1967,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_storecnt 0x0 ; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2013,10 +2008,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2236,10 +2229,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null 
offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2279,10 +2270,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2432,7 +2421,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2440,7 +2430,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -2482,7 +2472,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2491,7 +2482,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -2564,14 +2555,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -2608,7 +2600,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2616,7 +2609,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -2649,12 +2642,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: 
v_max_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2798,11 +2791,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2839,11 +2832,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2941,7 +2934,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: 
v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2950,7 +2944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3064,7 +3058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3072,7 +3067,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3104,12 +3099,12 
@@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3249,11 +3244,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3288,11 +3283,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3356,14 +3351,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3448,8 +3443,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3591,14 +3585,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3677,8 +3671,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3886,8 +3879,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3955,8 +3948,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: 
v_lshlrev_b32_e32 v4, v7, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4174,14 +4167,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -4233,14 +4226,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; 
GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4330,13 +4323,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -4382,13 +4375,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4428,9 +4421,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4755,14 +4748,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; 
GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4901,13 +4894,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4946,9 +4939,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5222,14 +5215,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5322,8 +5315,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5480,14 +5472,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5575,8 +5567,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5799,8 +5790,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5873,8 +5864,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; 
GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5943,8 +5934,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6014,8 +6005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6135,11 +6126,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6168,9 +6160,9 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -6297,9 +6289,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6344,9 +6336,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6472,9 +6464,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ 
-6708,8 +6700,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6834,8 +6825,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7014,8 +7004,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 ; GFX908-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7076,8 +7066,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_max_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: 
.LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7148,9 +7138,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7308,11 +7298,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -7366,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen 
offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -7457,10 +7448,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -7511,7 +7503,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -7556,8 +7548,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 
glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7875,9 +7867,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -8011,9 +8004,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -8056,8 +8050,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; 
GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8346,8 +8340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8436,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8596,8 +8588,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent 
Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8680,8 +8671,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8907,8 +8897,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -8982,8 +8972,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9047,13 +9037,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 
16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9121,13 +9111,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v9 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index d1dc76f321375..d3e43f6945b29 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -593,8 +594,8 @@ define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 ; GFX908-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -651,8 +652,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; GFX8-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -798,9 +799,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_min_f32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -829,9 +830,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1186,10 +1187,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1229,10 +1228,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1367,10 +1364,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1408,10 +1405,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1459,9 +1456,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1489,9 +1486,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: 
v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1567,12 +1564,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[5:6] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 @@ -1667,12 +1664,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -1794,11 +1791,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -1858,11 +1855,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -1970,10 +1967,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 
th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2013,10 +2008,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2236,10 +2229,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2279,10 +2270,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, 
v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2432,7 +2421,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2440,7 +2430,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -2482,7 +2472,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2491,7 +2482,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -2564,14 +2555,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -2608,7 +2600,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2616,7 +2609,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -2649,12 +2642,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2798,11 +2791,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: 
v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2839,11 +2832,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2941,7 +2934,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2950,7 +2944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: 
v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3064,7 +3058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3072,7 +3067,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3104,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; 
GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3249,11 +3244,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3288,11 +3283,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3356,14 +3351,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3448,8 +3443,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3591,14 +3585,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: 
v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3677,8 +3671,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3886,8 +3879,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3955,8 +3948,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4174,14 +4167,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 
depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -4233,14 +4226,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4330,13 +4323,13 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -4382,13 +4375,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4428,9 +4421,9 @@ define bfloat 
@buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4755,14 +4748,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4901,13 +4894,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, 
vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4946,9 +4939,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5222,14 +5215,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5322,8 +5315,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5480,14 +5472,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s4, v0 @@ -5575,8 +5567,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5799,8 +5790,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5873,8 +5864,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5943,8 +5934,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; 
GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6014,8 +6005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6135,11 +6126,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6168,9 +6160,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -6297,9 +6289,9 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6344,9 +6336,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6472,9 +6464,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6708,8 +6700,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6834,8 +6825,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7014,8 +7004,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 ; GFX908-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7076,8 +7066,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_min_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7148,9 +7138,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: 
.LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7308,11 +7298,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -7366,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -7457,10 +7448,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -7511,7 +7503,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -7556,8 +7548,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7875,9 +7867,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -8011,9 +8004,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -8056,8 +8050,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8346,8 +8340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8436,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8596,8 +8588,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8680,8 +8671,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 
v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8907,8 +8897,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -8982,8 +8972,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9047,13 +9037,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ 
-9121,13 +9111,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 ; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll index 25bad218926f3..8612b95b9b44b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll @@ -35,12 +35,13 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -136,12 +137,13 @@ define i32 
@buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_rem ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -187,12 +189,13 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -243,12 +246,13 @@ define i32 @buffer_fat_ptr_system_atomic_usub_cond_ret_u32__offset__amdgpu_no_fi ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 96b71cf85c8b2..bfd0c405fe1a6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GISEL-NEXT: v_mov_b32_e32 v7, s7 @@ -68,9 +68,9 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace ; GISEL-NEXT: s_ashr_i32 s7, s6, 31 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 ; GISEL-NEXT: s_add_u32 s4, s8, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v4, s6 ; GISEL-NEXT: s_addc_u32 s5, s9, s5 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 37f4094806637..62599aa9d7d08 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -274,9 +274,9 
@@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index c407f7645315d..3b6a4e974e9cc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -71,11 +71,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: s_mov_b32 s32, 0 @@ -90,11 +90,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: s_mov_b32 s32, 0 @@ -109,11 +109,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; SDAG-NEXT: 
s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -156,11 +156,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -182,11 +182,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -205,11 +205,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; 
CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -228,11 +228,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -287,10 +287,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -315,11 +315,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 
; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 @@ -338,11 +338,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: v_and_b32_e32 v0, 1, v0 @@ -361,11 +361,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 @@ -420,10 +420,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], 
s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s32, 0 @@ -462,11 +462,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: s_mov_b32 s32, 0 @@ -481,11 +481,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -539,11 +539,11 @@ define 
amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -566,11 +566,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -588,11 +588,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -610,11 +610,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: 
s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -666,10 +666,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -692,11 +692,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -714,11 +714,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 
s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -736,11 +736,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -792,10 +792,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -814,11 +814,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; VI-NEXT: 
s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s32, 0 @@ -833,11 +833,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: s_mov_b32 s32, 0 @@ -852,11 +852,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -910,11 +910,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], 
s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -936,11 +936,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -958,11 +958,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -980,11 +980,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: 
s_swappc_b64 s[30:31], s[4:5] @@ -1036,10 +1036,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1062,11 +1062,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1084,11 +1084,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1106,11 +1106,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_i16_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1162,10 +1162,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1184,11 +1184,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 @@ -1203,11 +1203,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: 
s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 @@ -1222,11 +1222,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 42 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -1269,11 +1269,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1291,11 +1291,11 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: 
s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -1311,11 +1311,11 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: v_mov_b32_e32 v1, 0 @@ -1331,11 +1331,11 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: v_mov_b32_e32 v1, 0 @@ -1380,12 +1380,12 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; 
GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1408,10 +1408,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1430,10 +1430,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1452,10 +1452,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1532,11 +1532,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; VI-NEXT: s_mov_b32 
s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -1554,11 +1554,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -1576,11 +1576,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -1630,7 +1630,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] 
; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1638,6 +1637,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1660,10 +1660,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -1684,10 +1684,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 @@ -1708,10 +1708,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; SDAG-NEXT: 
s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v4, 1 ; SDAG-NEXT: v_mov_b32_e32 v5, 2 @@ -1802,10 +1802,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -1828,10 +1828,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 @@ -1854,10 +1854,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v4, 1 ; SDAG-NEXT: v_mov_b32_e32 v5, 2 @@ -1927,9 +1927,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; 
GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: v_mov_b32_e32 v6, 3 ; GISEL-NEXT: v_mov_b32_e32 v7, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1949,11 +1949,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s32, 0 @@ -1968,11 +1968,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 @@ -1987,11 +1987,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -2045,11 +2045,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x4400 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2067,11 +2067,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 4.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -2086,11 +2086,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; 
CI-NEXT: v_mov_b32_e32 v0, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 @@ -2105,11 +2105,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 4.0 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -2152,11 +2152,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2174,11 +2174,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2194,11 +2194,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2214,11 +2214,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2263,12 +2263,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2286,11 +2286,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; 
VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2307,11 +2307,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2328,11 +2328,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2380,13 +2380,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; 
GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2404,11 +2404,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2450,11 +2450,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; 
SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2507,7 +2507,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2516,6 +2515,7 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 ; GISEL-NEXT: v_mov_b32_e32 v3, -1.0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0.5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2533,11 +2533,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; 
CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -2573,11 +2573,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -2622,12 +2622,12 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2645,11 +2645,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, 
external_void_func_v2f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2667,11 +2667,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2689,11 +2689,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2743,7 +2743,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2751,6 +2750,7 @@ define 
amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2768,11 +2768,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2792,11 +2792,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2816,11 +2816,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2875,7 +2875,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2885,6 +2884,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2905,11 +2905,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2927,10 +2927,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; CI-NEXT: 
s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2949,11 +2949,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2994,16 +2994,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -3028,10 +3028,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, 
external_void_func_v3i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3049,10 +3049,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3074,10 +3074,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3124,13 +3124,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, 
external_void_func_v3i16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3153,10 +3153,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3174,10 +3174,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3200,10 +3200,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3250,13 +3250,13 @@ define amdgpu_kernel void 
@test_call_external_void_func_v3f16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3275,11 +3275,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 3 @@ -3295,11 +3295,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 
@@ -3316,11 +3316,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 ; SDAG-NEXT: v_mov_b32_e32 v1, 3 @@ -3365,12 +3365,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 ; GISEL-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3388,11 +3388,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; VI-NEXT: v_mov_b32_e32 v1, 0x4400 @@ -3408,11 +3408,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -3429,11 +3429,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x4400 @@ -3479,12 +3479,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x4400 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3506,10 +3506,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; VI-NEXT: 
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3527,10 +3527,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3553,10 +3553,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3603,13 +3603,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3628,11 +3628,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 0x40003 @@ -3648,11 +3648,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -3670,11 +3670,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: 
s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x40003 @@ -3720,12 +3720,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x40003 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3746,11 +3746,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3767,11 +3767,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; CI-NEXT: 
s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3792,11 +3792,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3837,16 +3837,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -3871,10 +3871,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] 
; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3892,10 +3892,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3913,10 +3913,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3963,13 +3963,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; 
GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3988,11 +3988,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4008,11 +4008,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4028,11 +4028,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 
s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4077,12 +4077,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4100,11 +4100,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 @@ -4121,11 +4121,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 
v1, 4 @@ -4142,11 +4142,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -4194,13 +4194,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 3 ; GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4218,11 +4218,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 @@ -4240,11 +4240,11 @@ define 
amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 v1, 4 @@ -4262,11 +4262,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -4316,7 +4316,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -4324,6 +4323,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GISEL-NEXT: v_mov_b32_e32 v2, 5 ; GISEL-NEXT: v_mov_b32_e32 v3, 6 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; 
GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4345,10 +4345,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4366,10 +4366,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4387,10 +4387,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4464,11 +4464,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; 
VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4486,11 +4486,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4508,11 +4508,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4562,7 +4562,6 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GISEL-NEXT: 
s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -4570,6 +4569,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4587,11 +4587,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4610,11 +4610,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4633,11 +4633,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, 
external_void_func_v5i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4690,7 +4690,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -4699,6 +4698,7 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4723,10 +4723,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4747,10 +4747,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, 
s4, external_void_func_v8i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4771,10 +4771,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4830,10 +4830,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -4862,11 +4862,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; 
VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4888,11 +4888,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4914,11 +4914,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4978,7 +4978,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4990,6 +4989,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v5, 6 ; GISEL-NEXT: v_mov_b32_e32 v6, 7 ; GISEL-NEXT: v_mov_b32_e32 v7, 8 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 
s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5016,10 +5016,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5042,10 +5042,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5068,10 +5068,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5131,10 +5131,10 @@ define amdgpu_kernel void 
@test_call_external_void_func_v16i32() #0 { ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -5183,12 +5183,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[8:9] ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5215,12 +5215,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[8:9] ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5247,12 +5247,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; 
SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_getpc_b64 s[8:9] ; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(6) ; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5325,11 +5325,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; GISEL-NEXT: s_mov_b32 s55, 0xe00000 ; GISEL-NEXT: s_add_u32 s52, s52, s3 -; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s23 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 @@ -5354,6 +5353,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 @@ -5396,12 +5396,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, 
external_void_func_v32i32_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5431,12 +5431,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5466,12 +5466,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(8) ; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5558,11 +5558,10 @@ define amdgpu_kernel void 
@test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 -; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; GISEL-NEXT: v_mov_b32_e32 v0, s23 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -5587,6 +5586,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 @@ -5615,14 +5615,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; VI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s50, -1 ; VI-NEXT: s_mov_b32 s51, 0xe80000 -; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_addc_u32 s49, s49, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[48:49] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[48:49] ; VI-NEXT: s_mov_b64 s[2:3], s[50:51] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 @@ -5639,14 +5639,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s50, -1 ; CI-NEXT: s_mov_b32 s51, 
0xe8f000 -; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_addc_u32 s49, s49, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 @@ -5663,14 +5663,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; SDAG-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; SDAG-NEXT: s_mov_b32 s50, -1 ; SDAG-NEXT: s_mov_b32 s51, 0xe00000 -; SDAG-NEXT: s_add_u32 s48, s48, s5 ; SDAG-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; SDAG-NEXT: s_add_u32 s48, s48, s5 ; SDAG-NEXT: s_addc_u32 s49, s49, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] ; SDAG-NEXT: s_mov_b64 s[2:3], s[50:51] ; SDAG-NEXT: v_mov_b32_e32 v0, 42 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -5724,15 +5724,15 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; GISEL-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GISEL-NEXT: s_mov_b32 s50, -1 ; GISEL-NEXT: s_mov_b32 s51, 0xe00000 -; GISEL-NEXT: s_add_u32 s48, s48, s5 ; GISEL-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GISEL-NEXT: s_add_u32 s48, s48, s5 ; GISEL-NEXT: s_addc_u32 s49, s49, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, 
external_i32_func_i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5762,10 +5762,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5786,10 +5786,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5810,10 +5810,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: 
s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5869,10 +5869,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[8:9] ; GISEL-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -5901,12 +5901,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_movk_i32 s32, 0x400 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -5929,12 +5929,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_movk_i32 s32, 0x400 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -5958,12 +5958,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_movk_i32 s32, 0x400 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(1) ; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6047,12 +6047,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 -; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_movk_i32 s32, 0x400 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt vmcnt(1) ; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 @@ -6086,10 +6086,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; VI-NEXT: s_movk_i32 s32, 0x800 ; VI-NEXT: 
s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6124,10 +6124,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; CI-NEXT: s_movk_i32 s32, 0x800 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6163,10 +6163,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; SDAG-NEXT: s_movk_i32 s32, 0x800 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(1) ; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6300,10 +6300,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; GISEL-NEXT: s_movk_i32 s32, 
0x800 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt vmcnt(1) ; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 @@ -6353,10 +6353,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6395,10 +6395,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6437,10 +6437,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_v16i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -6562,26 +6562,26 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GISEL-NEXT: s_lshr_b32 s17, s3, 8 ; GISEL-NEXT: s_lshr_b32 s18, s3, 16 ; GISEL-NEXT: s_lshr_b32 s19, s3, 24 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s1 -; GISEL-NEXT: v_mov_b32_e32 v8, s2 -; GISEL-NEXT: v_mov_b32_e32 v12, s3 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s8 ; GISEL-NEXT: v_mov_b32_e32 v2, s9 ; GISEL-NEXT: v_mov_b32_e32 v3, s10 +; GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GISEL-NEXT: v_mov_b32_e32 v5, s11 ; GISEL-NEXT: v_mov_b32_e32 v6, s12 ; GISEL-NEXT: v_mov_b32_e32 v7, s13 +; GISEL-NEXT: v_mov_b32_e32 v8, s2 ; GISEL-NEXT: v_mov_b32_e32 v9, s14 ; GISEL-NEXT: v_mov_b32_e32 v10, s15 ; GISEL-NEXT: v_mov_b32_e32 v11, s16 +; GISEL-NEXT: v_mov_b32_e32 v12, s3 ; GISEL-NEXT: v_mov_b32_e32 v13, s17 ; GISEL-NEXT: v_mov_b32_e32 v14, s18 ; GISEL-NEXT: v_mov_b32_e32 v15, s19 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: s_endpgm @@ -6610,12 +6610,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; 
VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: s_mov_b64 s[2:3], s[54:55] ; VI-NEXT: v_mov_b32_e32 v0, s36 ; VI-NEXT: v_mov_b32_e32 v1, s37 @@ -6669,12 +6669,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: s_mov_b64 s[2:3], s[54:55] ; CI-NEXT: v_mov_b32_e32 v0, s36 ; CI-NEXT: v_mov_b32_e32 v1, s37 @@ -6728,12 +6728,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; SDAG-NEXT: v_mov_b32_e32 v0, s4 ; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; SDAG-NEXT: v_mov_b32_e32 v0, s5 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] ; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] ; SDAG-NEXT: s_mov_b64 s[2:3], s[54:55] ; SDAG-NEXT: v_mov_b32_e32 v0, s36 ; SDAG-NEXT: v_mov_b32_e32 v1, s37 @@ -6883,7 +6883,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; GISEL-NEXT: s_getpc_b64 s[4:5] ; 
GISEL-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 @@ -6908,6 +6907,7 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ef5438e63f667..644a903138de1 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -245,14 +245,14 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 ; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -357,14 +357,14 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 ; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 
v31, v0, v2 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 163d7ff9c61fc..bcf15280a7434 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -110,8 +110,9 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -222,8 +223,9 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1087,8 +1089,9 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1199,8 +1202,9 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -2420,8 +2424,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_cmp_ge_u32 s2, s6 ; GFX9-NEXT: s_cselect_b32 s8, s4, s3 ; GFX9-NEXT: .LBB16_3: -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -2573,8 +2577,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 ; GFX1010-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1010-NEXT: .LBB16_3: -; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v1, s9 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm @@ -2726,8 +2730,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 ; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3 ; GFX1030W32-NEXT: .LBB16_3: -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm @@ -2878,8 +2882,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: s_cmp_ge_u32 s2, s4 ; GFX1030W64-NEXT: s_cselect_b32 s6, s5, s3 ; GFX1030W64-NEXT: .LBB16_3: -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm @@ -3046,9 +3050,8 @@ define 
amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cmp_ge_u32 s2, s4 ; GFX11-NEXT: s_cselect_b32 s8, s5, s3 ; GFX11-NEXT: .LBB16_3: -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index da5e73199a223..98b6fa018a70a 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -54,13 +54,13 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_add_u32 s0, s0, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -72,11 +72,11 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v11, v[6:7] ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: s_add_u32 s2, s2, 24 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 @@ -179,13 +179,13 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 @@ -199,18 +199,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_add_u32 s4, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s5, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; GFX10-NEXT: flat_store_dword v[0:1], v8 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index a99aab7a23a3b..bf9bae774fd50 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM ; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck 
%s --check-prefixes=ASM,GISEL-ASM + ; Tests that we can avoid nullptr checks for addrspacecasts from/to priv/local. ; ; Whenever a testcase is successful, we should see the addrspacecast replaced with the intrinsic @@ -228,8 +229,8 @@ define void @private_alloca_to_flat(ptr %ptr) { ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base ; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6 -; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index ce541dd2954f4..f8771cee537ca 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1027,10 +1027,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: s_cbranch_execz .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -1177,10 +1174,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b32 s5, s10 ; GCN-O0-NEXT: s_mov_b32 s6, s9 ; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: s_waitcnt expcnt(4) ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:28 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir index 46a72c032827c..4e87905f464f1 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir @@ -15,10 +15,10 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy ; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11 ; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 @@ -37,8 +37,8 @@ body: | ; CHECK-LABEL: name: nonoverlapping_copy_kill ; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr0_sgpr1_sgpr2 - ; CHECK-NEXT: $sgpr2 = S_MOV_B32 $sgpr6, implicit killed $sgpr4_sgpr5_sgpr6 + ; CHECK-NEXT: 
$sgpr0_sgpr1 = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr2 = S_MOV_B32 killed $sgpr6 ; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 renamable $sgpr0_sgpr1_sgpr2 = COPY killed renamable $sgpr4_sgpr5_sgpr6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir index 5efeb8d40afbb..b18fa12ae782f 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir @@ -15,9 +15,9 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy ; CHECK: liveins: $sgpr30_sgpr31, $vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr1_vgpr2_vgpr3 @@ -36,9 +36,9 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy_1 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr2_vgpr3_vgpr4 @@ -57,9 +57,9 @@ body: | ; CHECK-LABEL: name: nonoverlapping_copy_kill ; CHECK: liveins: $sgpr30_sgpr31, $vgpr3_vgpr4_vgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit killed $vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr5, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr3_vgpr4_vgpr5 @@ -78,10 +78,10 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_half_s128 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4_vgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: $vgpr0 = 
V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = V_OR_B32_e32 1, $vgpr1, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir index 9376a4c59c170..c82539c48e6f9 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir @@ -14,11 +14,10 @@ body: | ; CHECK-LABEL: name: copy_has_implicit_kill_superreg ; CHECK: renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit-def $vgpr7_vgpr8, implicit $vgpr10_vgpr11 - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr10_vgpr11, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7 renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF - renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 + renamable $vgpr7_vgpr8 = COPY_LANEMASK killed renamable $vgpr10_vgpr11, lanemask(3), implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 S_ENDPGM 0, implicit $vgpr7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir index cc976fe13c47c..3e987973920e0 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -15,13 +15,13 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64 ; GFX908: liveins: $vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64 ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64 ; GFX942: liveins: $vgpr2_vgpr3 @@ -31,8 +31,9 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64 ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64 ; GFX1250: liveins: $vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} @@ -49,13 +50,13 @@ body: | ; GFX908-LABEL: name: copy_s64_to_v64 ; GFX908: liveins: $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def 
$vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s64_to_v64 ; GFX90A: liveins: $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $sgpr2_sgpr3, 12, killed $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s64_to_v64 ; GFX942: liveins: $sgpr2_sgpr3 @@ -65,8 +66,9 @@ body: | ; GFX10-LABEL: name: copy_s64_to_v64 ; GFX10: liveins: $sgpr2_sgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s64_to_v64 ; GFX1250: liveins: $sgpr2_sgpr3 ; GFX1250-NEXT: {{ $}} @@ -83,31 +85,32 @@ body: | ; GFX908-LABEL: name: copy_a64_to_v64 ; GFX908: liveins: $agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_a64_to_v64 ; GFX90A: liveins: $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: 
$vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_a64_to_v64 ; GFX942: liveins: $agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_a64_to_v64 ; GFX10: liveins: $agpr2_agpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX10-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_a64_to_v64 ; GFX1250: liveins: $agpr2_agpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX1250-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec ... 
@@ -120,35 +123,36 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_fwd ; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_fwd ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_fwd ; GFX942: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, 
implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_fwd ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_fwd ; GFX1250: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ... 
@@ -161,35 +165,36 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_back ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_back ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_back ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, 
implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_back ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_back ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $exec $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ... 
@@ -202,36 +207,37 @@ body: | ; GFX908-LABEL: name: copy_v96_to_v96 ; GFX908: liveins: $vgpr4_vgpr5_vgpr6 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_to_v96 ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_to_v96 ; GFX942: liveins: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit 
$exec ; ; GFX10-LABEL: name: copy_v96_to_v96 ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_to_v96 ; GFX1250: liveins: $vgpr4_vgpr5_vgpr6 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec ... 
@@ -244,13 +250,12 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX908: liveins: $vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX90A: liveins: $vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX942: liveins: $vgpr3 @@ -260,13 +265,13 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX10: liveins: $vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX1250: liveins: $vgpr3 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY_LANEMASK killed $vgpr2_vgpr3, lanemask(12), implicit $exec ... 
--- @@ -278,13 +283,12 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX908: liveins: $vgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX90A: liveins: $vgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX942: liveins: $vgpr2 @@ -294,13 +298,13 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX10: liveins: $vgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX1250: liveins: $vgpr2 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY_LANEMASK killed $vgpr2_vgpr3, lanemask(3), implicit $exec ... 
--- @@ -312,35 +316,36 @@ body: | ; GFX908-LABEL: name: copy_s128_to_v128_killed ; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; ; GFX90A-LABEL: name: copy_s128_to_v128_killed ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $sgpr4_sgpr5, 12, killed $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, killed $sgpr6_sgpr7, 12, killed $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec ; ; GFX942-LABEL: name: copy_s128_to_v128_killed ; GFX942: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed 
$sgpr4_sgpr5, implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr6_sgpr7, implicit $exec ; ; GFX10-LABEL: name: copy_s128_to_v128_killed ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; ; GFX1250-LABEL: name: copy_s128_to_v128_killed ; GFX1250: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr4_sgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr6_sgpr7, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -353,31 +358,32 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_unaligned ; GFX908: liveins: $vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_unaligned ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_unaligned ; GFX942: liveins: $vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_to_v64_unaligned ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_unaligned ; GFX1250: liveins: $vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 
= V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -390,31 +396,32 @@ body: | ; GFX908-LABEL: name: copy_v64_unaligned_to_v64 ; GFX908: liveins: $vgpr3_vgpr4 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_unaligned_to_v64 ; GFX90A: liveins: $vgpr3_vgpr4 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_unaligned_to_v64 ; GFX942: liveins: $vgpr3_vgpr4 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 ; GFX10: liveins: $vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = 
V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_unaligned_to_v64 ; GFX1250: liveins: $vgpr3_vgpr4 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec ... @@ -427,41 +434,42 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, 
implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_unaligned ; GFX942: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, 
implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_unaligned ; GFX1250: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ... 
@@ -474,41 +482,42 @@ body: | ; GFX908-LABEL: name: copy_v128_unaligned_to_v128 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_unaligned_to_v128 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_unaligned_to_v128 ; GFX942: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, 
implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_unaligned_to_v128 ; GFX1250: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: 
$vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -521,31 +530,32 @@ body: | ; GFX908-LABEL: name: copy_s64_to_v64_unaligned ; GFX908: liveins: $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s64_to_v64_unaligned ; GFX90A: liveins: $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s64_to_v64_unaligned ; GFX942: liveins: $sgpr8_sgpr9 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: 
copy_s64_to_v64_unaligned ; GFX10: liveins: $sgpr8_sgpr9 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s64_to_v64_unaligned ; GFX1250: liveins: $sgpr8_sgpr9 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec ... 
@@ -558,41 +568,42 @@ body: | ; GFX908-LABEL: name: copy_s128_to_v128_unaligned ; GFX908: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s128_to_v128_unaligned ; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s128_to_v128_unaligned ; GFX942: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def 
$vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_s128_to_v128_unaligned ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s128_to_v128_unaligned ; GFX1250: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit 
$sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ... @@ -605,36 +616,37 @@ body: | ; GFX908-LABEL: name: copy_v96_to_v96_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_to_v96_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_to_v96_unaligned ; GFX942: liveins: 
$vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v96_to_v96_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_to_v96_unaligned ; GFX1250: liveins: $vgpr8_vgpr9_vgpr10 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed 
$vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -647,36 +659,37 @@ body: | ; GFX908-LABEL: name: copy_v96_unaligned_to_v96 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_unaligned_to_v96 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_unaligned_to_v96 ; GFX942: liveins: $vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; 
GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_unaligned_to_v96 ; GFX1250: liveins: $vgpr7_vgpr8_vgpr9 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec ... 
@@ -689,36 +702,37 @@ body: | ; GFX908-LABEL: name: copy_s96_to_v96 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s96_to_v96 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s96_to_v96 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit 
$exec ; ; GFX10-LABEL: name: copy_s96_to_v96 ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s96_to_v96 ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... 
@@ -731,35 +745,36 @@ body: | ; GFX908-LABEL: name: copy_s96_to_v96_unaligned ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s96_to_v96_unaligned ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s96_to_v96_unaligned ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed 
$sgpr0, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_s96_to_v96_unaligned ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s96_to_v96_unaligned ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index f6cd3d131a627..078dff49f01cf 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] ; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 07e6a76d14cf9..237a0f9dbaccc 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -106,10 +106,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -537,8 +537,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -563,11 +563,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -701,12 +701,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -718,7 +719,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm @@ -816,16 +816,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: 
flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -951,33 +951,33 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 @@ -1009,8 +1009,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1119,11 +1119,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias 
%out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1201,8 +1201,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1228,11 +1228,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[0:1] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1505,10 +1505,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1584,10 +1584,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1661,10 +1661,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2220,12 +2220,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; VI-NEXT: s_lshl_b32 s2, s2, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 37f5889918c41..77ae8b021417d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -351,9 +351,9 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; SI-NEXT: s_mov_b32 s5, 0 ; SI-NEXT: .LBB7_3: ; %endif -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: 
s_endpgm @@ -377,10 +377,10 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: .LBB7_3: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index d17cdeb8917ff..b95eff1b9feca 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -572,8 +572,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 137acd34ecc2a..62c0d3820ab53 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -93,10 +93,10 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -514,8 +514,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: 
v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -540,11 +540,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -672,8 +672,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -781,16 +781,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: 
v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -916,33 +916,33 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 @@ -974,8 +974,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1094,16 +1094,16 @@ define amdgpu_kernel void 
@v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1216,16 +1216,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1341,16 +1341,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 
1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1456,10 +1456,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1541,8 +1541,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll index 39af6a05d2725..ad75d1a814955 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr ; GCN-LABEL: private_load_maybe_divergent: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_add_u32 s20, s20, s17 ; GCN-NEXT: s_addc_u32 s21, s21, 0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index dd9a013d37203..b1eecfedbd442 100644 --- 
a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -185,9 +185,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 @@ -1267,16 +1267,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 -; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] ; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] -; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 @@ -1309,14 +1309,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 ; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 @@ -1375,8 +1375,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr3_vgpr4 killed $exec -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v1 @@ -1660,11 +1661,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 @@ -1768,8 +1771,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: 
s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(19) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 @@ -1813,11 +1817,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v15, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v27 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v28 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v29 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -2197,11 +2203,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -2382,9 +2390,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: 
v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_mov_b32_e32 v16, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 @@ -2431,8 +2439,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3306,17 +3314,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] @@ -3331,23 +3339,23 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX9-G-NEXT: 
v_cndmask_b32_e32 v3, v3, v11, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3384,8 +3392,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 ; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 @@ -3686,11 +3694,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt 
vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 @@ -3794,8 +3804,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(19) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v5 @@ -3839,11 +3850,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v13, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v30 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 9f1b55ea3b1ef..4e54ad43ff2a9 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1333,28 +1333,28 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_mov_b32 s14, s10 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s12, s8 +; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s48, s48, s11 ; 
CI-NEXT: s_mov_b64 s[10:11], s[6:7] ; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; CI-NEXT: s_load_dword s6, s[4:5], 0x2 ; CI-NEXT: s_addc_u32 s49, s49, 0 +; CI-NEXT: s_mov_b32 s12, s8 ; CI-NEXT: s_add_u32 s8, s4, 12 -; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 ; CI-NEXT: ds_read_b32 v41, v40 -; CI-NEXT: s_addc_u32 s9, s5, 0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b64 s[4:5], s[0:1] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: s_mov_b64 s[0:1], s[48:49] +; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: s_addc_u32 s9, s5, 0 ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; CI-NEXT: v_or_b32_e32 v31, v0, v2 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 @@ -1373,8 +1373,8 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 ; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] @@ -1382,17 +1382,17 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_add_u32 s8, s4, 12 -; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 ; GFX9-NEXT: ds_read_b32 v42, v41 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], 
s[0:1] -; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 1684437eff580..313d138b7f64f 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1107,8 +1107,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; CI-LABEL: store_misaligned64_constant_large_offsets: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b64 s[0:1], 0x7b -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384 @@ -1118,8 +1118,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; GFX9-LABEL: store_misaligned64_constant_large_offsets: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..e2bfb61783711 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -1024,10 +1024,10 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GCN-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 ; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 5c91ee3f7e748..ffd4f91c0d265 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5) { ; CHECK-LABEL: cannot_create_empty_or_backwards_segment: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_add_u32 s24, s24, s17 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index c69b0cce3d208..2af8f0ba81584 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -66,9 +66,9 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_cmp_eq_u32 s2, 3 ; GCN-NEXT: s_cselect_b32 s2, 0x40100a3d, s3 ; GCN-NEXT: s_cselect_b32 s3, 0x70a3d70a, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: 
v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -103,9 +103,9 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_cselect_b32 s2, 0x70a3d70a, s8 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -144,8 +144,8 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -166,9 +166,9 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b32 s2, s3, 0x3f847ae1 ; GCN-NEXT: s_cselect_b32 s3, s4, 0x47ae147b -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -285,15 +285,15 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s18, s[4:5], 0x2c -; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s15, 0x40200000 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 ; GCN-NEXT: s_mov_b32 s11, 0x40180000 ; GCN-NEXT: s_mov_b32 s9, 0x40140000 ; GCN-NEXT: s_mov_b32 s7, 0x40100000 ; GCN-NEXT: 
s_mov_b32 s5, 0x40080000 ; GCN-NEXT: s_mov_b32 s3, 2.0 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s2, s0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s6, s0 @@ -336,6 +336,7 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x2c +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 ; GCN-NEXT: s_mov_b32 s11, 0x40180000 @@ -343,17 +344,15 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s7, 0x40100000 ; GCN-NEXT: s_mov_b32 s5, 0x40080000 ; GCN-NEXT: s_mov_b32 s3, 2.0 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s2, s0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s6, s0 ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s10, s0 ; GCN-NEXT: s_mov_b32 s12, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -367,7 +366,6 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: v_mov_b32_e32 v12, s12 ; GCN-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NEXT: v_mov_b32_e32 v14, s14 ; GCN-NEXT: v_movrels_b32_e32 v16, v1 ; GCN-NEXT: v_movrels_b32_e32 v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s14 @@ -419,6 +417,7 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 @@ -434,7 +433,6 @@ define amdgpu_kernel void @double15_extelt(ptr 
addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s43, 0x40100000 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 ; GCN-NEXT: s_mov_b32 s39, 2.0 -; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s38, s36 ; GCN-NEXT: s_mov_b32 s40, s36 ; GCN-NEXT: s_mov_b32 s42, s36 @@ -451,7 +449,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v31, s67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 @@ -482,7 +479,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v27, s63 ; GCN-NEXT: v_mov_b32_e32 v28, s64 ; GCN-NEXT: v_mov_b32_e32 v29, s65 -; GCN-NEXT: v_mov_b32_e32 v30, s66 ; GCN-NEXT: v_movrels_b32_e32 v32, v1 ; GCN-NEXT: v_movrels_b32_e32 v31, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -500,8 +496,9 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s67, 0x40300000 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 ; GCN-NEXT: s_mov_b32 s61, 0x402a0000 @@ -516,7 +513,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s43, 0x40100000 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 ; GCN-NEXT: s_mov_b32 s39, 2.0 -; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s38, s36 ; GCN-NEXT: s_mov_b32 s40, s36 ; GCN-NEXT: s_mov_b32 s42, s36 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index fac9f5bf826a6..16300185a4b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -69,8 +69,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -123,8 +123,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -177,8 +177,8 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -274,8 +274,8 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -371,8 +371,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; 
VI-NEXT: flat_store_byte v[0:1], v2 @@ -427,8 +427,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_lshr_b32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -468,8 +468,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_lshr_b32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index c46fcde739b1c..ba63da85e75e7 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -220,9 +220,9 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -246,9 +246,9 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_or_b32 s3, s4, s3 ; VI-NEXT: s_or_b32 s2, s5, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 27cf49aec8229..614612de3ee5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -220,9 +220,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -236,9 +236,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 97e23fcdb2263..233936988017f 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -116,9 +116,9 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 52bcaed7ec75a..5259d20664d3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -79,13 
+79,13 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm @@ -228,23 +228,23 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d32b528d13276..932987b321042 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -613,9 +613,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -660,9 +660,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -707,9 +707,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -1464,9 +1464,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1512,9 +1512,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1558,9 +1558,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1602,9 +1602,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1646,9 +1646,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1690,9 +1690,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1738,9 +1738,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1831,9 +1831,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1878,9 +1878,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1922,9 +1922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1966,9 +1966,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2054,9 +2054,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2098,9 +2098,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2142,9 +2142,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 
8b5c34d97e50e..f5bc295ba6b85 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -34,9 +34,9 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -80,9 +80,9 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -124,9 +124,9 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -168,9 +168,9 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -212,9 +212,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -256,9 +256,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -306,9 +306,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -359,9 +359,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -634,10 +634,10 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; VI-NEXT: v_mov_b32_e32 v2, s9 ; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: 
v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -704,13 +704,13 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -787,14 +787,14 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 9e527cf38e7ee..e044fbfcd488b 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1724,11 +1724,11 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, 
s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1800,11 +1800,11 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1876,11 +1876,11 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index c510c40c8536c..e618a723b7fc9 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -540,9 +540,9 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -601,9 +601,9 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -662,9 +662,9 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -886,9 +886,9 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1282,10 +1282,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1358,10 +1358,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2157,10 +2157,10 @@ define amdgpu_kernel void 
@s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2307,10 +2307,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v3, v4, v2, v3 ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll index 607ed85274e40..4a5fb7d4a511f 100644 --- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll +++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll @@ -9,17 +9,17 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 { ; GCN-LABEL: same_address_fence_merge_write2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GCN-NEXT: s_mov_b32 s1, 0x40100000 +; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206 -; GCN-NEXT: s_mov_b32 
s1, 0x3ff00000 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b2e9831d6c84f..d0df75a4a4a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -97,8 +97,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index b35f07002a48a..9e6ac75a2ee3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -5603,8 +5603,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5618,8 +5618,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, 
s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5631,8 +5631,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5666,8 +5666,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5685,8 +5685,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5702,8 +5702,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5750,8 +5750,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5771,8 +5771,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5790,8 +5790,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s2, s0 ; GCN3-NEXT: s_addc_u32 s1, s3, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5839,8 +5839,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5863,8 +5863,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; 
GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5885,8 +5885,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5932,8 +5932,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5945,8 +5945,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5958,8 +5958,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 
; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5990,8 +5990,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6007,8 +6007,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6024,8 +6024,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6069,8 +6069,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s2, s0 ; GCN1-NEXT: s_addc_u32 s1, s3, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6088,8 +6088,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; 
GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s2, s0 ; GCN2-NEXT: s_addc_u32 s1, s3, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6107,8 +6107,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s2, s0 ; GCN3-NEXT: s_addc_u32 s1, s3, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6153,8 +6153,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6175,8 +6175,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6197,8 +6197,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; 
GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8797,7 +8797,7 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 278964334b711..e4b69e045d609 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1497,8 +1497,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1522,8 +1522,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1547,8 +1547,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 
s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1626,8 +1626,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2159,8 +2159,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2184,8 +2184,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2209,8 +2209,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2288,8 +2288,8 @@ define amdgpu_gfx i32 
@flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2839,8 +2839,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2865,8 +2865,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2891,8 +2891,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2973,8 +2973,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: 
s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3592,8 +3592,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3617,8 +3617,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3642,8 +3642,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3721,8 +3721,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4254,8 +4254,8 @@ define amdgpu_gfx i32 
@flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4279,8 +4279,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4304,8 +4304,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4383,8 +4383,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4916,8 +4916,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 
s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4941,8 +4941,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4966,8 +4966,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5045,8 +5045,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5980,8 +5980,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6005,8 +6005,8 @@ define amdgpu_gfx i32 
@flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6030,8 +6030,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6109,8 +6109,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6954,8 +6954,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6979,8 +6979,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: 
s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7004,8 +7004,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7083,8 +7083,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7616,8 +7616,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7641,8 +7641,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7666,8 +7666,8 @@ define amdgpu_gfx 
i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7745,8 +7745,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8703,8 +8703,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8730,8 +8730,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8757,8 +8757,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: 
s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8842,8 +8842,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9262,9 +9262,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9291,9 +9291,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9320,9 +9320,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v1, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB145_1: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9411,9 +9411,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9445,9 +9445,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9474,9 +9474,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9503,9 +9503,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9594,9 +9594,9 @@ 
define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 25fbdbc83b2b9..c17b82a036fc0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -29,8 +29,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -79,8 +79,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -164,8 +164,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB1_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-216,8 +216,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB1_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -320,8 +320,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -374,8 +374,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -467,8 +467,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB3_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -521,8 +521,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB3_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -621,8 +621,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -750,8 +750,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB5_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -800,8 +800,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB5_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -900,8 +900,8 @@ define amdgpu_kernel void 
@atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -952,8 +952,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1041,8 +1041,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB7_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1093,8 +1093,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB7_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1193,8 +1193,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: 
v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1325,8 +1325,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB9_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1376,8 +1376,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB9_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1478,8 +1478,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1531,8 +1531,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, 
i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1622,8 +1622,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB11_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1675,8 +1675,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB11_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1773,8 +1773,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1820,8 +1820,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1899,8 +1899,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB13_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1948,8 +1948,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB13_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2046,8 +2046,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2097,8 +2097,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2184,8 +2184,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB15_2 ; GCN1-NEXT: ; 
%bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2235,8 +2235,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB15_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2333,8 +2333,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2383,8 +2383,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2468,8 +2468,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB17_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], 
v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2520,8 +2520,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB17_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2624,8 +2624,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2678,8 +2678,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2771,8 +2771,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB19_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2825,8 +2825,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB19_2 ; GCN2-NEXT: ; %bb.1: 
; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2925,8 +2925,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2973,8 +2973,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3054,8 +3054,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB21_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3104,8 +3104,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB21_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3204,8 +3204,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3256,8 +3256,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3345,8 +3345,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB23_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3397,8 +3397,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB23_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3497,8 +3497,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3547,8 +3547,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3632,8 +3632,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB25_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3685,8 +3685,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB25_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3790,8 +3790,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3844,8 +3844,8 @@ define amdgpu_kernel void 
@atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3937,8 +3937,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB27_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3992,8 +3992,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB27_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4093,8 +4093,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,8 +4141,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, 
s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4222,8 +4222,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB29_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB29_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4374,8 +4374,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4426,8 +4426,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4515,8 +4515,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB31_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global 
; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4568,8 +4568,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB31_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4669,8 +4669,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4719,8 +4719,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4804,8 +4804,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB33_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ 
-4857,8 +4857,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB33_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,8 +4962,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5016,8 +5016,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5109,8 +5109,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB35_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5164,8 +5164,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB35_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5265,8 +5265,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5313,8 +5313,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5394,8 +5394,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB37_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5445,8 +5445,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB37_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5546,8 +5546,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 
%index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5687,8 +5687,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB39_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5740,8 +5740,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB39_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5841,8 +5841,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: 
flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5891,8 +5891,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5976,8 +5976,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB41_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6029,8 +6029,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB41_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6134,8 +6134,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6188,8 +6188,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 
-; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6281,8 +6281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB43_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6336,8 +6336,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB43_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6437,8 +6437,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6485,8 +6485,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6566,8 +6566,8 @@ define amdgpu_kernel void 
@atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB45_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6617,8 +6617,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB45_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6718,8 +6718,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6770,8 +6770,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6859,8 +6859,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB47_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 
v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6912,8 +6912,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB47_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7013,8 +7013,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7063,8 +7063,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7148,8 +7148,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB49_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7201,8 +7201,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB49_2 
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7306,8 +7306,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7360,8 +7360,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7453,8 +7453,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB51_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7508,8 +7508,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB51_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], 
v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7609,8 +7609,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7657,8 +7657,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7738,8 +7738,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB53_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7789,8 +7789,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB53_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7890,8 +7890,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7942,8 +7942,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -8031,8 +8031,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB55_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -8084,8 +8084,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB55_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -8185,8 +8185,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8234,8 +8234,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr 
%out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8317,8 +8317,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: s_cbranch_vccz .LBB57_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8368,8 +8368,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_cbranch_vccz .LBB57_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8470,8 +8470,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8523,8 +8523,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8614,8 +8614,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB59_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8667,8 +8667,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB59_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8765,8 +8765,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8812,8 +8812,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8891,8 +8891,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB61_2 ; 
GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8940,8 +8940,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB61_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9038,8 +9038,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9089,8 +9089,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9176,8 +9176,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: s_cbranch_vccz .LBB63_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: 
flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9227,8 +9227,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_cbranch_vccz .LBB63_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9325,8 +9325,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9370,8 +9370,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9451,8 +9451,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9496,8 +9496,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 
v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9577,8 +9577,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9622,8 +9622,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9698,8 +9698,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB67_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9748,8 +9748,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB67_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -9848,8 +9848,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9897,8 +9897,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9981,8 +9981,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB69_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10033,8 +10033,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB69_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10129,8 +10129,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 
v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10172,8 +10172,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10244,8 +10244,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB71_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10292,8 +10292,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB71_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10388,8 +10388,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10435,8 +10435,8 @@ define 
amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10515,8 +10515,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB73_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10565,8 +10565,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB73_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10661,8 +10661,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10710,8 +10710,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10793,8 +10793,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB75_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10844,8 +10844,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB75_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10946,8 +10946,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10999,8 +10999,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11090,8 +11090,8 @@ define 
amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB77_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11143,8 +11143,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB77_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11241,8 +11241,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11288,8 +11288,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11367,8 +11367,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB79_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, 
s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11416,8 +11416,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB79_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11514,8 +11514,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11565,8 +11565,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11652,8 +11652,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB81_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11703,8 +11703,8 @@ define amdgpu_kernel 
void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB81_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12213,11 +12213,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12267,11 +12267,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12364,11 +12364,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 
v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,11 +12418,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12508,11 +12508,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB92_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12562,11 +12562,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB92_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12668,11 +12668,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN1-NEXT: 
v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s10 ; GCN1-NEXT: v_mov_b32_e32 v1, s11 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12724,11 +12724,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s10 ; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12820,11 +12820,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB94_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12878,11 +12878,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB94_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12986,11 +12986,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13038,11 +13038,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13124,11 +13124,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN1-NEXT: s_cbranch_vccz .LBB96_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v4, s8 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 ; GCN1-NEXT: v_mov_b32_e32 v5, s9 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13176,11 +13176,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN2-NEXT: s_cbranch_vccz .LBB96_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v4, s8 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; 
GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v4, s8 ; GCN2-NEXT: v_mov_b32_e32 v5, s9 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13278,11 +13278,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s10 ; GCN1-NEXT: v_mov_b32_e32 v1, s11 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13332,11 +13332,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s10 ; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13424,11 +13424,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB98_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13480,11 +13480,11 @@ define 
amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB98_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14000,8 +14000,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14052,8 +14052,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14142,8 +14142,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB108_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14196,8 +14196,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB108_2 ; 
GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14305,8 +14305,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14361,8 +14361,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14459,8 +14459,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB110_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14515,8 +14515,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB110_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, 
s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14620,8 +14620,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14670,8 +14670,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14756,8 +14756,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB112_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14808,8 +14808,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB112_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14913,8 +14913,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB113_3: ; 
%atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14967,8 +14967,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15061,8 +15061,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB114_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15115,8 +15115,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB114_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15220,8 +15220,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 
v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15275,8 +15275,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15371,8 +15371,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB116_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15429,8 +15429,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB116_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15545,8 +15545,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15604,8 +15604,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB117_3: ; 
%atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15708,8 +15708,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB118_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15768,8 +15768,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB118_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15880,8 +15880,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15933,8 +15933,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16025,8 +16025,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB120_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16081,8 +16081,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB120_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16193,8 +16193,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16250,8 +16250,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16350,8 +16350,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB122_2 ; GCN1-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16408,8 +16408,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB122_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index e5187a811a230..096f20b91cede 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_add_i64_offset: ; GFX7: ; %bb.0: ; %entry @@ -10,9 +11,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -25,9 +26,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 
-; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -38,14 +39,15 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -59,8 +61,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -78,8 +80,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -92,19 +94,21 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -158,15 +162,16 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 
v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -221,17 +226,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -385,7 +391,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr 
%out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -445,7 +451,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -510,14 +516,15 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -590,19 +597,21 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -680,15 +689,16 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -769,17 
+779,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1015,7 +1026,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1101,7 +1112,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, 
!noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1168,14 +1179,15 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1196,8 +1208,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1229,8 +1241,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], 
v[5:8] glc @@ -1250,19 +1262,21 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1342,15 +1356,16 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], 
v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1373,8 +1388,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1408,8 +1423,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1433,17 +1448,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], 
v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1458,8 +1474,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1486,8 +1502,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1531,14 +1547,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1564,14 +1580,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1685,7 +1701,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1706,8 +1722,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1739,8 +1755,8 @@ define 
amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1773,7 +1789,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1842,14 +1858,15 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -1926,19 +1943,21 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: 
; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2020,15 +2039,16 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, 
ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2113,17 +2133,18 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2138,9 +2159,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 @@ -2167,9 +2188,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2213,9 +2234,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2247,9 +2268,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2371,7 +2392,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2461,7 +2482,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - 
%ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2530,14 +2551,15 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2614,19 +2636,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2708,15 +2732,16 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2801,17 +2826,18 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2826,9 +2852,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2855,9 +2881,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2901,9 +2927,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2935,9 +2961,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3059,7 +3085,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3149,7 +3175,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3218,14 +3244,15 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 
scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3302,19 +3329,21 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3396,15 +3425,16 @@ define amdgpu_kernel void 
@atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3489,17 +3519,18 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - 
%gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3514,9 +3545,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3543,9 +3574,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3589,9 +3620,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3623,9 +3654,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, 
s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3747,7 +3778,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3837,7 +3868,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3906,14 +3937,15 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3990,19 +4022,21 @@ define amdgpu_kernel void 
@atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4084,15 +4118,16 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 
; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4177,17 +4212,18 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4202,9 +4238,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4231,9 +4267,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4277,9 +4313,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4311,9 +4347,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4435,7 +4471,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4525,7 +4561,7 @@ define amdgpu_kernel void 
@atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4590,14 +4626,15 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4670,19 +4707,21 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; 
GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4760,15 +4799,16 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4849,17 +4889,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 
s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5095,7 +5136,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5181,7 +5222,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5194,9 +5235,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; 
GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5209,9 +5250,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5222,14 +5263,15 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5241,9 +5283,9 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5256,9 +5298,9 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5269,14 +5311,15 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5288,9 +5331,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5303,9 +5346,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5316,14 +5359,15 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds ptr, ptr %out, i32 4 + %gep = getelementptr ptr, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5337,8 +5381,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5356,8 +5400,8 @@ define amdgpu_kernel void 
@atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5370,19 +5414,21 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5436,15 +5482,16 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 
s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5499,17 +5546,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg 
ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5663,7 +5711,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5723,7 +5771,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5788,14 +5836,15 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5868,19 +5917,21 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: 
atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5958,15 +6009,16 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm 
entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6047,17 +6099,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6293,7 +6346,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6379,7 +6432,7 @@ define 
amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6422,15 +6475,16 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %in, i64 4 + %gep = getelementptr i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6532,16 +6586,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS 
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %in, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %in, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6601,7 +6656,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %in, i64 %index + %ptr = getelementptr i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6638,14 +6693,15 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6732,15 +6788,16 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6792,7 +6849,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 ret void } @@ -6805,11 +6862,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6823,11 +6880,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6840,15 +6897,16 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6861,11 +6919,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 0x11940 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6879,11 +6937,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 0x11940 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] 
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6896,15 +6954,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x11940 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 9000 + %gep = getelementptr i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6916,11 +6975,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6936,11 +6995,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; 
GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6954,17 +7013,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -6981,11 +7041,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7001,11 +7061,11 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7020,14 +7080,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7044,10 +7106,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX7-NEXT: s_addc_u32 s3, s9, s3 ; GFX7-NEXT: s_add_u32 s2, s0, 32 ; GFX7-NEXT: s_addc_u32 s3, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7068,10 +7130,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX8-NEXT: s_addc_u32 s3, s9, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7091,16 +7153,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7223,11 +7287,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; 
GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7241,11 +7305,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7266,7 +7330,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7280,11 +7344,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX7-NEXT: s_add_u32 s2, s8, s2 ; GFX7-NEXT: s_addc_u32 s3, s9, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7302,11 +7366,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; 
GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX8-NEXT: s_add_u32 s2, s8, s2 ; GFX8-NEXT: s_addc_u32 s3, s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7334,7 +7398,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7378,15 +7442,16 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds double, ptr %in, i64 4 + %gep = getelementptr double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7488,16 +7553,17 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %in, i64 %index - %gep = getelementptr inbounds double, ptr %ptr, i64 4 + %ptr = getelementptr double, ptr %in, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7557,7 +7623,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %in, i64 %index + %ptr = getelementptr double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7594,14 +7660,15 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = 
getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7688,15 +7755,16 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %out, i64 %index - %gep = getelementptr inbounds double, ptr %ptr, i64 4 + %ptr = getelementptr double, ptr %out, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7748,7 +7816,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %out, i64 %index + %ptr = getelementptr double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7818,14 +7886,15 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 
scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -7845,8 +7914,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -7880,8 +7949,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -7904,19 +7973,21 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8000,15 +8071,16 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ 
-8030,8 +8102,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8067,8 +8139,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8095,17 +8167,18 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = 
getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8200,8 +8273,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8233,8 +8306,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8355,7 +8428,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8375,8 +8448,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 
0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8410,8 +8483,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8447,7 +8520,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8526,14 +8599,15 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8620,19 +8694,21 @@ define amdgpu_kernel void 
@atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8724,15 +8800,16 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8827,17 +8904,18 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8852,9 +8930,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: 
v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8886,9 +8964,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9111,7 +9189,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -9211,7 +9289,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 9e27f6badfdac..e39fd817a0bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; 
--------------------------------------------------------------------- @@ -478,8 +479,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -517,8 +518,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -553,8 +554,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -595,8 +596,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,8 +637,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB5_3: ; 
%atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -674,8 +675,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -709,8 +710,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cbranch_vccz .LBB6_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -748,8 +749,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cbranch_vccz .LBB6_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -784,8 +785,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cbranch_vccz .LBB6_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: 
flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -826,8 +827,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_cbranch_vccz .LBB7_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -867,8 +868,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_cbranch_vccz .LBB7_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -905,8 +906,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_cbranch_vccz .LBB7_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1638,8 +1639,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1677,8 +1678,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, 
double ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1713,8 +1714,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1755,8 +1756,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1796,8 +1797,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1834,8 +1835,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB15_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 
; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1869,8 +1870,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN1-NEXT: s_cbranch_vccz .LBB16_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1908,8 +1909,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN2-NEXT: s_cbranch_vccz .LBB16_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1944,8 +1945,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN3-NEXT: s_cbranch_vccz .LBB16_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1986,8 +1987,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_cbranch_vccz .LBB17_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2027,8 +2028,8 @@ define amdgpu_gfx 
double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_cbranch_vccz .LBB17_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2065,8 +2066,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_cbranch_vccz .LBB17_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2856,8 +2857,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2900,8 +2901,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2941,8 +2942,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: 
v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2988,8 +2989,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3034,8 +3035,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3077,8 +3078,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3117,8 +3118,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cbranch_vccz .LBB26_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -3159,8 +3160,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cbranch_vccz .LBB26_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3198,8 +3199,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cbranch_vccz .LBB26_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3243,8 +3244,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cbranch_vccz .LBB27_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3287,8 +3288,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cbranch_vccz .LBB27_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3328,8 +3329,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cbranch_vccz .LBB27_2 ; GCN3-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4000,8 +4001,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 ; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4058,8 +4059,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4119,8 +4120,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 ; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4184,8 +4185,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB33_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 
v8, v0 +; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 ; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4246,8 +4247,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4303,8 +4304,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 ; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4736,8 +4737,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4797,8 +4798,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_mov_b32_e32 v8, v1 
; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4850,8 +4851,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4914,8 +4915,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4977,8 +4978,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -5032,8 +5033,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_mov_b32_e32 v8, 
v1 ; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -22039,8 +22040,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -22102,8 +22103,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -22168,8 +22169,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22238,8 +22239,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; 
GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22305,8 +22306,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22367,8 +22368,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22828,8 +22829,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -22893,8 +22894,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -22950,8 +22951,8 @@ define 
amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23018,8 +23019,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23085,8 +23086,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23144,8 +23145,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll 
b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index f655d4761fa31..9b26c3f50a4b4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; --------------------------------------------------------------------- @@ -59,11 +60,13 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -120,11 +123,13 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 
ret i64 %result } @@ -175,9 +180,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -189,9 +194,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -201,15 +206,17 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -260,9 +267,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg 
%out, i6 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -274,9 +281,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -286,15 +293,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -323,11 +332,13 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: 
flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -356,11 +367,13 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -421,11 +434,13 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = 
getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -482,11 +497,13 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -537,9 +554,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -551,9 +568,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -563,15 +580,17 @@ define amdgpu_gfx void 
@flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -622,9 +641,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,9 +655,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -648,15 +667,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: 
flat_atomic_xchg_f64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -685,11 +706,13 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -718,11 +741,13 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret double %result } @@ -783,11 +808,13 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_add_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -844,11 +871,13 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -899,9 +928,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -913,9 +942,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -925,15 +954,17 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -984,9 +1015,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 
v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -998,9 +1029,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1010,15 +1041,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1047,11 +1080,13 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1080,11 +1115,13 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1232,14 +1269,18 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: 
v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -1251,7 +1292,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1268,8 +1309,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1296,8 +1337,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1321,8 +1362,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; 
GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1355,8 +1396,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 ; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1383,8 +1424,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1401,28 +1442,32 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, v9, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v10, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB33_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1431,17 +1476,17 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1464,17 +1509,17 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1500,9 +1545,9 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1595,19 +1640,23 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s36 +; GFX9-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -1619,7 +1668,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1628,23 +1677,23 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 
v[0:1], v[2:3], v[5:8] glc @@ -1661,23 +1710,23 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1697,15 +1746,15 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1741,8 +1790,8 @@ define amdgpu_gfx i64 
@flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1774,8 +1823,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1792,31 +1841,35 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX9-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB37_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1845,11 +1898,13 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1878,11 +1933,13 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: 
; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2030,14 +2087,18 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -2049,7 +2110,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 ret void } @@ -2199,28 +2260,32 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB43_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2229,16 +2294,16 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; 
GFX7-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2261,16 +2326,16 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2296,8 +2361,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2388,18 +2453,22 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; 
GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2411,7 +2480,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2420,16 +2489,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; 
GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2452,16 +2521,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2487,8 +2556,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2579,30 +2648,34 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; 
GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB47_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2631,11 +2704,13 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] 
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -2664,11 +2739,13 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2826,8 +2903,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2835,7 +2916,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 ; GFX9-NEXT: v_not_b32_e32 v5, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v8 -; 
GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -2847,7 +2928,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3007,30 +3088,34 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v0, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v2 +; GFX9-NEXT: v_not_b32_e32 v8, v0 +; GFX9-NEXT: v_not_b32_e32 v7, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3039,16 +3124,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3073,16 +3158,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3110,8 +3195,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3208,20 +3293,24 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v2 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_not_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3233,7 +3322,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3242,16 +3331,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3276,16 +3365,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner 
Loop Header: Depth=1 @@ -3313,8 +3402,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3411,32 +3500,36 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v2 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3505,8 +3598,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3514,7 +3611,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 ; GFX9-NEXT: v_not_b32_e32 v5, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -3526,7 +3623,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - 
%gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -3595,30 +3692,34 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v0, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v2 +; GFX9-NEXT: v_not_b32_e32 v8, v0 +; GFX9-NEXT: v_not_b32_e32 v7, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; 
GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -3766,14 +3867,18 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -3785,7 +3890,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3935,28 +4040,32 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 
v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_or_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB63_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3965,16 +4074,16 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; 
GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3997,16 +4106,16 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4032,8 +4141,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4124,18 +4233,22 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4147,7 +4260,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4156,16 +4269,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4188,16 +4301,16 @@ 
define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4223,8 +4336,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4315,30 +4428,34 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB67_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4367,11 +4484,13 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -4400,11 +4519,13 @@ define i64 
@flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -4552,14 +4673,18 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4571,7 +4696,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = 
getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4721,28 +4846,32 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_xor_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB73_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 ret i64 %result } @@ -4751,16 +4880,16 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4783,16 +4912,16 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4818,8 +4947,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4910,18 +5039,22 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4933,7 +5066,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4942,16 +5075,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 
4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4974,16 +5107,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5009,8 +5142,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5101,30 +5234,34 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB77_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5153,11 +5290,13 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -5186,11 +5325,13 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -5343,15 +5484,19 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_max_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: 
v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -5363,7 +5508,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5518,29 +5663,33 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5549,18 +5698,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_max_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5584,18 +5733,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_max_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 
s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5622,10 +5771,10 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5723,21 +5872,25 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, 
v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5749,7 +5902,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5758,18 +5911,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_max_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5793,18 +5946,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_max_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 
v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5831,10 +5984,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5932,33 +6085,37 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6038,21 +6195,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6064,8 +6227,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6150,38 +6313,44 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = 
atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6284,7 +6453,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6395,7 +6564,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6425,11 +6594,13 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -6458,11 +6629,13 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -6615,15 +6788,19 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -6635,7 +6812,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6790,29 +6967,33 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; 
GFX9-LABEL: flat_atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6821,18 +7002,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg 
%ptr, i64 in ; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6856,18 +7037,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6894,10 +7075,10 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; 
GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6995,21 +7176,25 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -7021,7 +7206,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, 
ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7030,18 +7215,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7065,18 +7250,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7103,10 +7288,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], 
v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7204,33 +7389,37 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7310,21 +7499,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: 
v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -7336,8 +7531,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7422,38 +7617,44 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 
vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7565,7 +7766,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7595,11 +7796,13 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -7628,11 +7831,13 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -7785,15 +7990,19 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -7805,7 +8014,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7960,29 +8169,33 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: 
v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7991,18 +8204,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8026,18 +8239,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 
; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8064,10 +8277,10 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8165,21 +8378,25 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, 
s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -8191,7 +8408,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8200,18 +8417,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8235,18 +8452,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: 
flat_atomic_umin_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8273,10 +8490,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8374,33 +8591,37 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 
+; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8429,11 +8650,13 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -8462,11 +8685,13 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -8619,15 +8844,19 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_min_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -8639,7 +8868,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8794,29 +9023,33 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8825,18 +9058,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_min_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8860,18 +9093,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_min_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: 
s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8898,10 +9131,10 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8999,21 +9232,25 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9025,7 +9262,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9034,18 +9271,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_min_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9069,18 +9306,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_min_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9107,10 +9344,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9208,33 +9445,37 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] 
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9314,21 +9555,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9340,8 +9587,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9426,38 +9673,44 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; 
GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -9472,9 +9725,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 
%in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9502,9 +9755,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9532,9 +9785,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9664,7 +9917,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -9694,11 +9947,13 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
flat_atomic_smin_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -9727,11 +9982,13 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -9894,8 +10151,12 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -9904,7 +10165,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -9916,7 +10177,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9933,8 +10194,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -9964,8 +10225,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -9992,8 +10253,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10029,8 +10290,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10060,8 +10321,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10081,31 +10342,35 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v10, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB134_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10114,16 +10379,16 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: 
flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10149,16 +10414,16 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10187,8 +10452,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10288,21 +10553,25 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 
+; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -10314,7 +10583,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10323,22 +10592,22 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10358,22 +10627,22 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10396,14 +10665,14 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10441,8 +10710,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10476,8 +10745,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10497,33 +10766,37 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB138_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10552,11 +10825,13 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -10585,11 +10860,13 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -10762,8 +11039,12 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10774,7 +11055,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v5, 
v5, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -10786,7 +11067,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10961,33 +11242,37 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[9:10], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v9 +; GFX9-NEXT: 
v_addc_co_u32_e64 v1, s[6:7], -1, v10, s[6:7] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB144_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10996,18 +11281,18 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11035,18 +11320,18 @@ define amdgpu_gfx void 
@flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11077,10 +11362,10 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11190,25 +11475,29 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: 
v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s34 +; GFX9-NEXT: v_mov_b32_e32 v7, s35 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -11220,7 +11509,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -11229,18 +11518,18 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: 
v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11268,18 +11557,18 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11310,10 +11599,10 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11423,37 +11712,41 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v7, s[36:37] +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB148_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -11482,11 +11775,13 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -11515,11 +11810,13 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, 
!amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 0283b5ff5d439..815843cf85786 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -806,9 +806,9 @@ define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000 ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 29163c111fc5e..4dc7e436b52b5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -806,8 +806,8 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 4cccc768d3c50..606c58d1bd470 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -806,9 +806,9 @@ define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] ; GFX9-SDAG-NEXT: 
v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000 ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 02ce8be125afc..1694af9168e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s + ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't ; make add an instruction if the fadd has more than one use. 
@@ -88,17 +89,17 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_add_u32 s2, s0, 4 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_add_f32_e64 v2, s4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -151,16 +152,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: v_add_f32_e64 v2, |s2|, |s2| -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -212,11 +213,11 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_addc_u32 s7, s5, 0 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: 
s_addc_u32 s7, s5, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -324,9 +325,9 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -556,13 +557,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 @@ -580,16 +581,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 -; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: 
flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -712,13 +713,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 @@ -736,13 +737,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 @@ -869,13 +870,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s5, s2, 16 ; VI-DENORM-NEXT: 
v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: s_add_u32 s4, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5 ; VI-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, v0 +; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, |s2|, 2.0, v1 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: s_add_u32 s4, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v3 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4 @@ -893,13 +894,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s5, s2, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 +; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 ; VI-FLUSH-NEXT: v_mad_f16 v2, |s2|, 2.0, v0 +; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s2|, 2.0, v1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4 @@ -1086,9 +1087,9 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 6c2ab5fb15a20..4a84b011fa82f 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -134,9 +134,9 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x 
float> % ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 ; VI-NEXT: v_rndne_f32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -177,10 +177,10 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_rndne_f32_e32 v1, s1 ; VI-NEXT: v_rndne_f32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -417,8 +417,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s3 -; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index b3202cbe30d0b..f834ef89af7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7498,21 +7498,21 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v7, s1 ; SI-NEXT: flat_load_dword v8, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: flat_load_dword v2, v[2:3] glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: flat_load_dword v3, v[4:5] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s1 ; 
SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; SI-NEXT: flat_load_dword v0, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_trunc_f32_e32 v0, v8 ; SI-NEXT: v_fma_f32 v2, -v0, v2, v3 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm @@ -7532,21 +7532,21 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %o ; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: flat_load_dword v8, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v3, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_trunc_f32_e32 v0, v8 ; VI-NEXT: v_fma_f32 v2, -v0, v2, v3 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 833be2066cd54..9c9ac5f6c6f40 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4088,9 +4088,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] ; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v3, s3 ; 
SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -4108,9 +4108,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] ; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -4147,11 +4147,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] ; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 5424ebfcffcd1..4c994814dbc6f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -410,9 +410,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -442,9 +442,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, 
v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -626,9 +626,9 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -652,9 +652,9 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_or_b32 s3, s3, s5 ; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9d9a851a5507e..6a293b80f8711 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -409,9 +409,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_add_f16_e64 v1, s2, 1.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -518,9 +518,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CIVI-NEXT: 
v_mov_b32_e32 v3, s1 ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: v_mov_b32_e32 v3, s1 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CIVI-NEXT: s_endpgm @@ -590,9 +590,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mul_f16_e64 v1, |s2|, -4.0 ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 52b6d2cbaa6eb..67cc78cd921d9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -191,9 +191,9 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -227,11 +227,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -278,9 +278,9 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr 
addrspace(1) %out, <4 x double> % ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 214ccedd75170..a55f29888d223 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -216,9 +216,9 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -256,11 +256,11 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: s_bitset1_b32 s1, 31 ; VI-NEXT: s_bitset1_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index db08cb132a3d7..da37047048fb6 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1664,10 +1664,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 
-; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index 63aadaacbeb3a..802d72129d537 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 { ; CI-LABEL: s_fneg_bf16: ; CI: ; %bb.0: @@ -416,8 +417,8 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-LABEL: s_fneg_v2bf16_nonload: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND @@ -426,10 +427,10 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_or_b32 s2, s2, s3 ; CI-NEXT: s_add_i32 s2, s2, 0x80000000 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -444,9 +445,9 @@ define 
amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index cab27fca5ab0a..490623f5795fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -376,9 +376,9 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 02235151a83e1..31e1be387b035 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -69,9 +69,9 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -120,11 +120,11 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: s_xor_b32 s1, s1, 0x80000000 ; VI-NEXT: s_xor_b32 s0, 
s0, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 66d859fbd66ee..9d3e0fa635d41 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -22,9 +22,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS @@ -64,9 +64,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 4b800e4d47172..c52bb8d785057 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -28,8 +28,8 @@ define amdgpu_kernel void @test_isinf_pattern(ptr 
addrspace(1) nocapture %out, f ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -74,8 +74,8 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -155,8 +155,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -203,8 +203,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -248,8 +248,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: 
s_endpgm @@ -301,8 +301,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -355,8 +355,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 ; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 ; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -408,8 +408,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s6, s6 ; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -458,8 +458,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 
1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -562,8 +562,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 ; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -614,8 +614,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -664,8 +664,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -713,8 +713,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 15619532414ea..316187e064043 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -119,8 +119,8 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -484,8 +484,8 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -843,8 +843,8 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1197,8 +1197,8 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1586,8 +1586,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; 
SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1934,8 +1934,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll index 49204f84acb85..3d1d2a231805f 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -264,8 +264,8 @@ define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, doubl ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] ; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 11954ab7e5a2c..ea64e20029e89 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -11,6 +11,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s + define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: @@ -249,8 +250,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -2048,7 +2049,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 @@ -2263,8 +2264,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -3417,7 +3418,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 @@ -3919,8 +3920,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB6_5 ; GFX10-NEXT: ; %bb.6: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v6, v10 ; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v10 ; GFX10-NEXT: v_mov_b32_e32 v7, v11 ; GFX10-NEXT: .LBB6_7: ; 
%frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 @@ -4821,7 +4822,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 %r2 = frem afn double %r0, %r1 @@ -5262,8 +5263,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -16843,8 +16844,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB13_5 ; GFX10-NEXT: ; %bb.6: ; %Flow51 -; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v17, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v11, v15 ; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 @@ -16915,8 +16916,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB13_13 ; GFX10-NEXT: ; %bb.14: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v19, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v13, v17 ; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 @@ -17562,1360 +17563,5 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } - -define amdgpu_kernel void 
@frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; SI-LABEL: frem_v2f64_const_zero_num: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const_zero_num: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s2 -; CI-NEXT: s_mov_b32 s5, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: s_and_b64 s[2:3], vcc, exec -; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 -; CI-NEXT: s_mov_b32 s2, s6 -; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: v_mov_b32_e32 v1, s8 -; CI-NEXT: v_mov_b32_e32 v2, v0 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s4 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const_zero_num: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: 
v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const_zero_num: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4] -; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const_zero_num: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const_zero_num: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const_zero_num: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3] -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX1150-NEXT: v_mov_b32_e32 v3, s3 -; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const_zero_num: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: v_mov_b32_e32 v0, 0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: 
global_load_b128 v[1:4], v0, s[2:3] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_mov_b32_e32 v3, s3 -; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 - %r1 = frem <2 x double> , %r0 - store <2 x double> %r1, ptr addrspace(1) %out, align 16 - ret void -} - -define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; SI-LABEL: frem_v2f64_const_one_denum: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccz .LBB15_2 -; SI-NEXT: ; %bb.1: ; %frem.else16 -; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB15_3 -; SI-NEXT: s_branch .LBB15_8 -; SI-NEXT: .LBB15_2: -; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB15_3: ; %frem.compute15 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0x7ff00000 -; SI-NEXT: 
v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] -; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s2, v6 -; SI-NEXT: s_cselect_b32 s3, s2, 0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_add_i32 s5, s3, -1 -; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26 -; SI-NEXT: s_cmp_lt_i32 s5, 27 -; SI-NEXT: s_cbranch_scc1 .LBB15_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; SI-NEXT: s_add_i32 s5, s3, 25 -; SI-NEXT: v_mov_b32_e32 v9, 0x43300000 -; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: .LBB15_5: ; %frem.loop_body23 -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_bfi_b32 v5, s4, v9, v8 -; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5] -; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3] -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6] -; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26 -; SI-NEXT: s_sub_i32 s5, s5, 26 -; SI-NEXT: s_cmp_gt_i32 s5, 26 -; SI-NEXT: s_cbranch_scc1 .LBB15_5 -; SI-NEXT: ; %bb.6: ; %Flow50 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; SI-NEXT: s_sub_i32 s2, s5, 25 -; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 -; SI-NEXT: v_bfi_b32 v7, s2, v6, v5 -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: 
v_add_f64 v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc -; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; SI-NEXT: .LBB15_8: -; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccz .LBB15_10 -; SI-NEXT: ; %bb.9: ; %frem.else -; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB15_11 -; SI-NEXT: s_branch .LBB15_16 -; SI-NEXT: .LBB15_10: -; SI-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB15_11: ; %frem.compute -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0x7ff00000 -; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3] -; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s2, v8 -; SI-NEXT: s_cselect_b32 s3, s2, 0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_add_i32 s5, s3, -1 -; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26 -; SI-NEXT: s_cmp_lt_i32 s5, 27 -; SI-NEXT: s_cbranch_scc1 .LBB15_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; SI-NEXT: s_add_i32 s5, s3, 25 -; SI-NEXT: v_mov_b32_e32 v11, 0x43300000 -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: .LBB15_13: ; %frem.loop_body -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: 
v_mov_b32_e32 v10, v8 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_bfi_b32 v7, s4, v11, v10 -; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7] -; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3] -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8] -; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26 -; SI-NEXT: s_sub_i32 s5, s5, 26 -; SI-NEXT: s_cmp_gt_i32 s5, 26 -; SI-NEXT: s_cbranch_scc1 .LBB15_13 -; SI-NEXT: ; %bb.14: ; %Flow -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v8, v10 -; SI-NEXT: .LBB15_15: ; %frem.loop_exit -; SI-NEXT: s_sub_i32 s2, s5, 25 -; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v8, 0x43300000 -; SI-NEXT: v_bfi_b32 v9, s2, v8, v7 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9] -; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; SI-NEXT: .LBB15_16: ; %Flow49 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, 0x7ff00000 -; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: 
v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const_one_denum: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s2 -; CI-NEXT: s_mov_b32 s5, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; CI-NEXT: s_and_b64 vcc, exec, s[2:3] -; CI-NEXT: s_cbranch_vccz .LBB15_2 -; CI-NEXT: ; %bb.1: ; %frem.else16 -; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; CI-NEXT: s_cbranch_execz .LBB15_3 -; CI-NEXT: s_branch .LBB15_8 -; CI-NEXT: .LBB15_2: -; CI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CI-NEXT: .LBB15_3: ; %frem.compute15 -; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; CI-NEXT: s_cbranch_vccnz .LBB15_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6 -; CI-NEXT: .LBB15_5: ; %frem.loop_body23 -; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v7, v5 -; CI-NEXT: v_mov_b32_e32 v6, v4 -; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; CI-NEXT: 
s_cbranch_vccnz .LBB15_5 -; CI-NEXT: ; %bb.6: ; %Flow50 -; CI-NEXT: v_mov_b32_e32 v4, v6 -; CI-NEXT: v_mov_b32_e32 v5, v7 -; CI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8 -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; CI-NEXT: .LBB15_8: -; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; CI-NEXT: s_and_b64 vcc, exec, s[2:3] -; CI-NEXT: s_cbranch_vccz .LBB15_10 -; CI-NEXT: ; %bb.9: ; %frem.else -; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; CI-NEXT: s_cbranch_execz .LBB15_11 -; CI-NEXT: s_branch .LBB15_16 -; CI-NEXT: .LBB15_10: -; CI-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CI-NEXT: .LBB15_11: ; %frem.compute -; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; CI-NEXT: s_cbranch_vccnz .LBB15_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8 -; CI-NEXT: .LBB15_13: ; %frem.loop_body -; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v9, v7 -; CI-NEXT: v_mov_b32_e32 v8, v6 -; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10 
-; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 -; CI-NEXT: s_cbranch_vccnz .LBB15_13 -; CI-NEXT: ; %bb.14: ; %Flow -; CI-NEXT: v_mov_b32_e32 v6, v8 -; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB15_15: ; %frem.loop_exit -; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10 -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; CI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; CI-NEXT: .LBB15_16: ; %Flow49 -; CI-NEXT: s_mov_b32 s4, 0 -; CI-NEXT: s_mov_b32 s5, 0x7ff00000 -; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] -; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] -; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const_one_denum: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] -; VI-NEXT: s_cbranch_vccz .LBB15_2 -; VI-NEXT: ; %bb.1: ; %frem.else16 -; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; VI-NEXT: s_cbranch_execz .LBB15_3 -; VI-NEXT: s_branch .LBB15_8 -; VI-NEXT: .LBB15_2: -; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; VI-NEXT: .LBB15_3: ; 
%frem.compute15 -; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; VI-NEXT: s_cbranch_vccnz .LBB15_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6 -; VI-NEXT: .LBB15_5: ; %frem.loop_body23 -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; VI-NEXT: s_cbranch_vccnz .LBB15_5 -; VI-NEXT: ; %bb.6: ; %Flow50 -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v5, v7 -; VI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8 -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; VI-NEXT: s_brev_b32 s2, -2 -; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; VI-NEXT: .LBB15_8: -; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] -; VI-NEXT: s_cbranch_vccz .LBB15_10 -; VI-NEXT: ; %bb.9: ; %frem.else -; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; VI-NEXT: s_cbranch_execz .LBB15_11 -; VI-NEXT: s_branch .LBB15_16 -; VI-NEXT: .LBB15_10: -; VI-NEXT: ; 
implicit-def: $vgpr6_vgpr7 -; VI-NEXT: .LBB15_11: ; %frem.compute -; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; VI-NEXT: s_cbranch_vccnz .LBB15_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8 -; VI-NEXT: .LBB15_13: ; %frem.loop_body -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v7 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 -; VI-NEXT: s_cbranch_vccnz .LBB15_13 -; VI-NEXT: ; %bb.14: ; %Flow -; VI-NEXT: v_mov_b32_e32 v6, v8 -; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB15_15: ; %frem.loop_exit -; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10 -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; VI-NEXT: s_brev_b32 s2, -2 -; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; VI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; VI-NEXT: .LBB15_16: ; %Flow49 -; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_mov_b32 s3, 0x7ff00000 -; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] -; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: 
v_cndmask_b32_e32 v3, v8, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const_one_denum: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX9-NEXT: s_cbranch_vccz .LBB15_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else16 -; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_3 -; GFX9-NEXT: s_branch .LBB15_8 -; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: .LBB15_3: ; %frem.compute15 -; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v6 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX9-NEXT: v_add_u32_e32 v8, 25, v6 -; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8 -; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 -; GFX9-NEXT: ; %bb.6: ; %Flow50 -; GFX9-NEXT: v_mov_b32_e32 v4, v6 
-; GFX9-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8 -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 -; GFX9-NEXT: .LBB15_8: -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX9-NEXT: s_cbranch_vccz .LBB15_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else -; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_11 -; GFX9-NEXT: s_branch .LBB15_16 -; GFX9-NEXT: .LBB15_10: -; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX9-NEXT: .LBB15_11: ; %frem.compute -; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX9-NEXT: v_add_u32_e32 v10, -1, v8 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX9-NEXT: v_add_u32_e32 v10, 25, v8 -; GFX9-NEXT: .LBB15_13: ; %frem.loop_body -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10 -; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX9-NEXT: 
v_cmp_lt_i32_e32 vcc, 26, v10 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_13 -; GFX9-NEXT: ; %bb.14: ; %Flow -; GFX9-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10 -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3 -; GFX9-NEXT: .LBB15_16: ; %Flow49 -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 -; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const_one_denum: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_vccz .LBB15_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else16 -; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB15_3 -; GFX10-NEXT: s_branch .LBB15_8 -; GFX10-NEXT: .LBB15_2: -; GFX10-NEXT: ; implicit-def: 
$vgpr4_vgpr5 -; GFX10-NEXT: .LBB15_3: ; %frem.compute15 -; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX10-NEXT: v_readfirstlane_b32 s2, v6 -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX10-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX10-NEXT: s_add_i32 s2, s2, 25 -; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_sub_i32 s2, s2, 26 -; GFX10-NEXT: s_cmp_gt_i32 s2, 26 -; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX10-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX10-NEXT: ; %bb.6: ; %Flow50 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v8, s2 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX10-NEXT: .LBB15_8: -; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_vccz .LBB15_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else -; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX10-NEXT: v_and_b32_e32 v6, 
0x80000000, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB15_11 -; GFX10-NEXT: s_branch .LBB15_16 -; GFX10-NEXT: .LBB15_10: -; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX10-NEXT: .LBB15_11: ; %frem.compute -; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX10-NEXT: v_readfirstlane_b32 s2, v8 -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX10-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX10-NEXT: s_add_i32 s2, s2, 25 -; GFX10-NEXT: .LBB15_13: ; %frem.loop_body -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: s_sub_i32 s2, s2, 26 -; GFX10-NEXT: s_cmp_gt_i32 s2, 26 -; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX10-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX10-NEXT: ; %bb.14: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, v9 -; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX10-NEXT: .LBB15_16: ; %Flow49 -; 
GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const_one_denum: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else16 -; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB15_3 -; GFX11-NEXT: s_branch .LBB15_8 -; GFX11-NEXT: .LBB15_2: -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: .LBB15_3: ; %frem.compute15 -; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX11-NEXT: v_readfirstlane_b32 s2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX11-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX11-NEXT: s_add_i32 s2, s2, 25 -; GFX11-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_sub_i32 s2, s2, 26 -; GFX11-NEXT: s_cmp_gt_i32 s2, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX11-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX11-NEXT: ; %bb.6: ; %Flow50 -; GFX11-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX11-NEXT: .LBB15_8: -; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccz .LBB15_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else -; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX11-NEXT: v_and_b32_e32 v6, 
0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB15_11 -; GFX11-NEXT: s_branch .LBB15_16 -; GFX11-NEXT: .LBB15_10: -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: .LBB15_11: ; %frem.compute -; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX11-NEXT: v_readfirstlane_b32 s2, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX11-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX11-NEXT: s_add_i32 s2, s2, 25 -; GFX11-NEXT: .LBB15_13: ; %frem.loop_body -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: s_sub_i32 s2, s2, 26 -; GFX11-NEXT: s_cmp_gt_i32 s2, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX11-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX11-NEXT: ; %bb.14: ; %Flow -; GFX11-NEXT: v_mov_b32_e32 v6, v8 -; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX11-NEXT: .LBB15_15: ; 
%frem.loop_exit -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX11-NEXT: .LBB15_16: ; %Flow49 -; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const_one_denum: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1150-NEXT: s_cbranch_vccz .LBB15_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else16 -; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX1150-NEXT: 
s_cbranch_execz .LBB15_3 -; GFX1150-NEXT: s_branch .LBB15_8 -; GFX1150-NEXT: .LBB15_2: -; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1150-NEXT: .LBB15_3: ; %frem.compute15 -; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX1150-NEXT: s_add_i32 s2, s2, 25 -; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX1150-NEXT: s_sub_i32 s2, s2, 26 -; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX1150-NEXT: ; %bb.6: ; %Flow50 -; GFX1150-NEXT: v_mov_b32_e32 v4, v6 -; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; 
GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX1150-NEXT: .LBB15_8: -; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1150-NEXT: s_cbranch_vccz .LBB15_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else -; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX1150-NEXT: s_cbranch_execz .LBB15_11 -; GFX1150-NEXT: s_branch .LBB15_16 -; GFX1150-NEXT: .LBB15_10: -; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1150-NEXT: .LBB15_11: ; %frem.compute -; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX1150-NEXT: s_add_i32 s2, s2, 25 -; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body -; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX1150-NEXT: s_sub_i32 s2, s2, 26 -; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX1150-NEXT: ; %bb.14: ; %Flow -; GFX1150-NEXT: v_mov_b32_e32 v6, v8 -; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX1150-NEXT: .LBB15_16: ; %Flow49 -; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 
0x7ff00000, |v[2:3]| -; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const_one_denum: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: v_mov_b32_e32 v0, 0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1200-NEXT: s_cbranch_vccz .LBB15_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else16 -; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB15_3 -; GFX1200-NEXT: s_branch .LBB15_8 -; GFX1200-NEXT: .LBB15_2: -; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1200-NEXT: .LBB15_3: ; %frem.compute15 -; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX1200-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 -; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: 
v_dual_mov_b32 v6, v4 -; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 -; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX1200-NEXT: ; %bb.6: ; %Flow50 -; GFX1200-NEXT: v_mov_b32_e32 v4, v6 -; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX1200-NEXT: .LBB15_8: -; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_cbranch_vccz .LBB15_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else -; GFX1200-NEXT: 
v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB15_11 -; GFX1200-NEXT: s_branch .LBB15_16 -; GFX1200-NEXT: .LBB15_10: -; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1200-NEXT: .LBB15_11: ; %frem.compute -; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX1200-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 -; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body -; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7] -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; 
GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX1200-NEXT: ; %bb.14: ; %Flow -; GFX1200-NEXT: v_mov_b32_e32 v6, v8 -; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX1200-NEXT: .LBB15_16: ; %Flow49 -; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 - %r1 = frem <2 x double> %r0, - store <2 x double> %r1, ptr addrspace(1) %out, align 16 - ret void -} - -define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { -; SI-LABEL: frem_v2f64_const: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 
s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; CI-NEXT: v_mov_b32_e32 v2, v0 -; CI-NEXT: v_mov_b32_e32 v3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
0x3ff00000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v2, v0 -; GFX1150-NEXT: v_mov_b32_e32 v3, v0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = frem <2 x double> , - store <2 x double> %r0, ptr addrspace(1) %out, align 16 - ret void -} - - - -attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 72c2003058a01..c77806e3e6ebc 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 + declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind 
readnone declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone @@ -256,9 +257,9 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_and_b32 s2, s2, 31 ; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -397,9 +398,9 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_mov_b32 s3, s0 ; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 ; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -555,11 +556,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_and_b32 s0, s0, 31 ; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -768,11 +769,11 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 ; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 7afb2cf317869..d62499f328ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -9,6 +9,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 + declare i32 @llvm.fshr.i32(i32, i32, i32) declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) @@ -222,106 +223,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { -; SI-LABEL: fshr_i32_imm_src0: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s9, 7 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s8, s3 -; SI-NEXT: s_and_b32 s0, s2, 31 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: fshr_i32_imm_src0: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s5, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_and_b32 s2, s2, 31 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_i32_imm_src0: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s5, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_and_b32 s2, s2, 31 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_i32_imm_src0: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_i32_imm_src0: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s5, 7 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, s3 -; GFX10-NEXT: s_and_b32 s2, s2, 31 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_i32_imm_src0: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s5, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s3 -; GFX11-NEXT: s_and_b32 s2, s2, 31 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_i32_imm_src0: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s5, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_and_b32 s2, s2, 31 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, 
s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) - store i32 %0, ptr addrspace(1) %in - ret void -} - define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry @@ -356,9 +257,9 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_and_b32 s0, s6, 31 ; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 ; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -489,9 +390,9 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_mov_b32 s3, s0 ; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 ; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -582,145 +483,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { -; SI-LABEL: fshr_v2i32_imm_src1: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s8, 9 -; SI-NEXT: s_mov_b32 s10, 7 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_and_b32 s1, s3, 31 -; SI-NEXT: s_mov_b32 s11, s0 -; SI-NEXT: s_and_b32 s0, s2, 31 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: 
fshr_v2i32_imm_src1: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s6, 9 -; VI-NEXT: s_mov_b32 s8, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, s1 -; VI-NEXT: s_and_b32 s1, s3, 31 -; VI-NEXT: s_mov_b32 s9, s0 -; VI-NEXT: s_and_b32 s0, s2, 31 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_v2i32_imm_src1: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s4, 9 -; GFX9-NEXT: s_mov_b32 s8, 7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s9, s0 -; GFX9-NEXT: s_and_b32 s0, s2, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_v2i32_imm_src1: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_v2i32_imm_src1: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s4, 9 -; GFX10-NEXT: s_mov_b32 s8, 7 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: s_mov_b32 s9, s0 -; GFX10-NEXT: s_and_b32 s0, s2, 31 -; GFX10-NEXT: s_and_b32 s2, s3, 31 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_v2i32_imm_src1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, 9 -; GFX11-NEXT: s_mov_b32 s8, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, s1 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_and_b32 s0, s2, 31 -; GFX11-NEXT: s_and_b32 s2, s3, 31 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_v2i32_imm_src1: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, 9 -; GFX12-NEXT: s_mov_b32 s8, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s7, s1 -; GFX12-NEXT: s_mov_b32 s9, s0 -; GFX12-NEXT: s_and_b32 s0, s2, 31 -; GFX12-NEXT: s_and_b32 s2, s3, 31 -; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_endpgm -entry: - %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> , <2 x i32> %y) - store <2 x i32> %0, ptr addrspace(1) %in - ret void -} - define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry @@ -771,11 +533,11 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 ; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -956,11 +718,11 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 ; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -1079,194 +841,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { -; SI-LABEL: fshr_v4i32_imm_src0: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 33 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, s11 -; SI-NEXT: s_and_b32 s4, s15, 31 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 -; SI-NEXT: s_mov_b32 s11, 9 -; SI-NEXT: s_and_b32 s5, s14, 31 -; 
SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 -; SI-NEXT: s_mov_b32 s11, 7 -; SI-NEXT: s_mov_b32 s10, s9 -; SI-NEXT: s_and_b32 s5, s13, 31 -; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 -; SI-NEXT: s_mov_b32 s9, 1 -; SI-NEXT: s_and_b32 s5, s12, 31 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: fshr_v4i32_imm_src0: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s1, 33 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s11 -; VI-NEXT: s_and_b32 s4, s15, 31 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; VI-NEXT: s_mov_b32 s11, 9 -; VI-NEXT: s_and_b32 s1, s14, 31 -; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 -; VI-NEXT: s_mov_b32 s6, s9 -; VI-NEXT: s_and_b32 s1, s13, 31 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; VI-NEXT: s_mov_b32 s9, 1 -; VI-NEXT: s_and_b32 s1, s12, 31 -; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_v4i32_imm_src0: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_mov_b32 s1, 33 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s11 -; GFX9-NEXT: s_and_b32 s4, s15, 31 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_mov_b32 s11, 9 -; GFX9-NEXT: s_and_b32 s1, s14, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 -; GFX9-NEXT: s_mov_b32 s6, s9 -; GFX9-NEXT: 
s_and_b32 s1, s13, 31 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; GFX9-NEXT: s_mov_b32 s9, 1 -; GFX9-NEXT: s_and_b32 s1, s12, 31 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_v4i32_imm_src0: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, -; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_v4i32_imm_src0: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 33 -; GFX10-NEXT: s_mov_b32 s3, 7 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s11 -; GFX10-NEXT: s_and_b32 s4, s15, 31 -; GFX10-NEXT: s_mov_b32 s11, 9 -; GFX10-NEXT: s_and_b32 s5, s14, 31 -; GFX10-NEXT: s_mov_b32 s2, s9 -; GFX10-NEXT: s_and_b32 s13, s13, 31 -; GFX10-NEXT: s_mov_b32 s9, 1 -; GFX10-NEXT: s_and_b32 s12, s12, 31 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 
v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_v4i32_imm_src0: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s1, 33 -; GFX11-NEXT: s_mov_b32 s3, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s11 -; GFX11-NEXT: s_and_b32 s6, s15, 31 -; GFX11-NEXT: s_mov_b32 s11, 9 -; GFX11-NEXT: s_and_b32 s7, s14, 31 -; GFX11-NEXT: s_mov_b32 s2, s9 -; GFX11-NEXT: s_and_b32 s13, s13, 31 -; GFX11-NEXT: s_mov_b32 s9, 1 -; GFX11-NEXT: s_and_b32 s12, s12, 31 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_v4i32_imm_src0: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s1, 33 -; GFX12-NEXT: s_mov_b32 s3, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, s11 -; GFX12-NEXT: s_and_b32 s6, s15, 31 -; GFX12-NEXT: s_mov_b32 s11, 9 -; GFX12-NEXT: s_and_b32 s7, s14, 31 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_and_b32 s13, s13, 31 -; GFX12-NEXT: s_mov_b32 s9, 1 -; GFX12-NEXT: s_and_b32 s12, s12, 31 -; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 -; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 -; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX12-NEXT: s_endpgm -entry: - %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> , <4 x i32> %x, <4 x i32> %y) - store <4 x i32> %0, ptr addrspace(1) %in - ret void -} - define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { ; GFX89-LABEL: v_fshr_i32: ; GFX89: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c06011c259f9b..93bce89baa34f 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1990,10 +1990,10 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 21e5994819997..851dbf34f65f5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -7583,9 +7583,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; 
GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7618,9 +7618,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7817,9 +7817,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7852,9 +7852,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8054,9 +8054,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: 
v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8091,9 +8091,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9739,11 +9739,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9784,11 +9784,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10263,11 +10263,11 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10308,11 +10308,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10759,11 +10759,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10801,11 +10801,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_add_f32_e32 v3, 
v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11260,11 +11260,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11303,11 +11303,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11766,11 +11766,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], 
s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11809,11 +11809,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12160,10 +12160,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12195,10 +12195,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12533,10 +12533,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, 
v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12567,10 +12567,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13044,11 +13044,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13089,11 +13089,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13555,11 +13555,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13598,11 +13598,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19568,9 +19568,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19618,9 +19618,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; 
GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19812,9 +19812,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19862,9 +19862,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -20061,9 +20061,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20113,9 +20113,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -20986,9 +20986,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -21036,9 +21036,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -21471,9 +21471,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -21521,9 +21521,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; 
GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -21961,9 +21961,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -22011,9 +22011,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -22439,9 +22439,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -22489,9 +22489,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, 
v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index dbd48d2a7cf8f..92642943ddec9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -4210,11 +4211,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -4244,11 +4245,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 
-; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5276,11 +5277,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5321,11 +5322,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5738,11 +5739,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5783,11 +5784,11 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6172,11 +6173,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6214,11 +6215,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6616,11 +6617,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6659,11 +6660,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7063,11 +7064,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7106,11 +7107,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7414,10 +7415,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7449,10 +7450,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7747,10 +7748,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7781,10 +7782,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8198,11 +8199,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8243,11 +8244,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8652,11 +8653,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) @@ -8695,11 +8696,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13836,9 +13837,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13886,9 +13887,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14130,9 +14131,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; 
GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14180,9 +14181,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14429,9 +14430,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14481,9 +14482,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15587,9 +15588,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -15637,9 +15638,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 7930ad8a15404..c4647bf10545f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -4210,11 +4211,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -4244,11 +4245,11 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5276,11 +5277,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5321,11 +5322,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5738,11 +5739,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5783,11 +5784,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6172,11 +6173,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6214,11 +6215,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: 
buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6616,11 +6617,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6659,11 +6660,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7063,11 +7064,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7106,11 +7107,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7414,10 +7415,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7449,10 +7450,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7747,10 +7748,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: 
buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7781,10 +7782,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8198,11 +8199,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8243,11 +8244,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8652,11 +8653,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 
v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8695,11 +8696,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13836,9 +13837,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13886,9 +13887,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ 
-14130,9 +14131,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14180,9 +14181,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14429,9 +14430,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14481,9 +14482,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc 
; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15587,9 +15588,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -15637,9 +15638,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b79d0df960a0f..b81e4872a2057 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -3921,9 +3922,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: 
v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3956,9 +3957,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4173,9 +4174,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4208,9 +4209,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4428,9 +4429,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 
; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4465,9 +4466,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5983,11 +5984,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6028,11 +6029,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6420,11 +6421,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr 
addrspace(1) %p ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6465,11 +6466,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6834,11 +6835,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6876,11 +6877,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7253,11 +7254,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7296,11 +7297,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7675,11 +7676,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7718,11 
+7719,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8006,10 +8007,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8041,10 +8042,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8319,10 +8320,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 
v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8353,10 +8354,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8745,11 +8746,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8790,11 +8791,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9174,11 +9175,11 @@ define void 
@global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9217,11 +9218,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14321,9 +14322,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14371,9 +14372,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14598,9 +14599,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14648,9 +14649,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14880,9 +14881,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14932,9 +14933,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; 
GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15964,9 +15965,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16014,9 +16015,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 890ebddf36801..0524b284a3d05 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4793,8 +4793,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4913,8 +4913,8 @@ define amdgpu_ps void 
@global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index da132d0269e6b..8158d28181ab0 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -5321,9 +5321,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5345,9 +5345,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; GFX11-LABEL: atomic_cmpxchg_i32_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv @@ -5387,10 +5387,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: 
s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5417,9 +5417,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -5467,8 +5467,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5556,8 +5556,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5638,9 +5638,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 ; 
VI-NEXT: s_waitcnt vmcnt(0) @@ -5662,9 +5662,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; GFX11-LABEL: atomic_cmpxchg_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv @@ -5703,8 +5703,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5733,9 +5733,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -5780,8 +5780,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s2, s0 ; VI-NEXT: s_addc_u32 s1, s3, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ 
-5866,8 +5866,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index fb5e669d680f5..59de06b9c53bf 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1812,8 +1812,8 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2597,8 +2597,8 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3364,8 +3364,8 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4208,8 +4208,8 @@ define 
amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4993,8 +4993,8 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB78_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5778,8 +5778,8 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5958,8 +5958,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -6064,8 +6064,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -6169,8 +6169,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -6272,8 +6272,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -6946,8 +6946,8 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7126,8 +7126,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -7232,8 +7232,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -7345,8 +7345,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -8019,8 +8019,8 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8767,8 +8767,8 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8947,8 +8947,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -9053,8 +9053,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 
s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -9248,8 +9248,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -9960,8 +9960,8 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB140_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10579,9 +10579,9 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10784,9 +10784,9 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB150_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 
daa053ceea161..022b226aa7704 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -140,10 +140,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -217,11 +217,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -428,10 +428,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -502,11 +502,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: 
s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -708,10 +708,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -785,11 +785,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -996,10 +996,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1070,11 +1070,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; 
CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1276,10 +1276,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1353,11 +1353,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1564,10 +1564,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1638,11 +1638,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: 
s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1835,10 +1835,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2105,10 +2105,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -2173,11 +2173,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: 
v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2367,10 +2367,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -2438,11 +2438,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2637,10 +2637,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -2705,11 +2705,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 
s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2899,10 +2899,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -2970,11 +2970,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3169,10 +3169,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -3237,11 +3237,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, 
s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3431,10 +3431,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -3502,11 +3502,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3701,10 +3701,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -3769,11 +3769,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; 
CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3972,10 +3972,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4049,11 +4049,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4260,10 +4260,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4334,11 +4334,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, 
ptr a ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4650,10 +4650,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4727,11 +4727,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4938,10 +4938,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5012,11 +5012,11 @@ define amdgpu_kernel void 
@atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5218,10 +5218,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5295,11 +5295,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5506,10 +5506,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ 
-5580,11 +5580,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5878,12 +5878,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: s_mov_b64 s[8:9], s[0:1] ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v5, s5 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5899,11 +5899,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5957,7 +5957,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 -; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: s_mov_b32 s0, s10 ; CI-NEXT: s_mov_b32 s1, s11 ; CI-NEXT: s_mov_b32 s10, 0 @@ -5966,6 +5965,7 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: v_mov_b32_e32 v1, s13 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5984,10 +5984,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s3, s9, s3 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6198,11 +6198,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[8:9], s[0:1] ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6216,11 +6216,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6273,7 +6273,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 s2, 
-1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 -; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: s_mov_b32 s0, s10 ; CI-NEXT: s_mov_b32 s1, s11 ; CI-NEXT: s_mov_b32 s10, 0 @@ -6282,6 +6281,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s13 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6297,11 +6297,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; VI-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; VI-NEXT: s_add_u32 s2, s8, s2 ; VI-NEXT: s_addc_u32 s3, s9, s3 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6555,10 +6555,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6636,10 +6636,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: 
buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6714,10 +6714,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6899,8 +6899,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -6969,8 +6969,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm @@ -7036,8 +7036,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -7231,10 +7231,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -7434,10 +7434,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 6a4c2849ba4a3..66edbec65b56b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; --------------------------------------------------------------------- @@ -234,9 +235,9 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: 
v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -343,9 +344,9 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -672,9 +673,9 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -781,9 +782,9 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1110,9 +1111,9 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1219,9 +1220,9 @@ define amdgpu_gfx 
i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1536,8 +1537,8 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: .LBB32_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1561,8 +1562,8 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc @@ -1629,8 +1630,8 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: .LBB33_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1652,8 +1653,8 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc @@ -1725,9 +1726,9 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1929,15 +1930,15 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1961,8 +1962,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc @@ -2037,8 +2038,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -2062,8 +2063,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc @@ -2562,8 +2563,8 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB44_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2760,8 +2761,8 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3412,8 +3413,8 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; 
VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3622,8 +3623,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4375,8 +4376,8 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB64_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4573,8 +4574,8 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5201,8 +5202,8 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB74_1: ; %atomicrmw.start ; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5399,8 +5400,8 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6042,10 +6043,10 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6259,10 +6260,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6474,12 +6475,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB88_1: ; 
%atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6593,12 +6594,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6712,10 +6713,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -6828,10 +6829,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -7394,10 +7395,10 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: 
v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7611,10 +7612,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7826,12 +7827,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7945,12 +7946,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8068,10 +8069,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], 
s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -8634,10 +8635,10 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8851,10 +8852,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9510,10 +9511,10 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9727,10 +9728,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 
s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9942,12 +9943,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10061,12 +10062,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10175,9 +10176,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -10285,10 +10286,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: 
s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -10668,8 +10669,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: .LBB133_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10696,8 +10697,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10769,8 +10770,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; VI-NEXT: .LBB134_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10795,8 +10796,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 
v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10873,8 +10874,8 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB135_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11089,14 +11090,14 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -11122,8 +11123,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] @@ -11202,8 +11203,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -11229,8 +11230,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] @@ -11797,10 +11798,10 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12037,10 +12038,10 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB147_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index a6a886dc321ce..70fe85b1c1ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -452,14 +452,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -793,14 +793,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,14 +1665,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2006,14 +2006,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2938,14 +2938,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, 
v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3279,14 +3279,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3707,14 +3707,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4048,14 +4048,14 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4979,14 +4979,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5346,14 +5346,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5759,8 +5759,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5792,10 +5792,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5826,10 +5826,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5973,8 +5973,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6006,10 +6006,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6040,10 +6040,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6236,14 +6236,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6620,14 +6620,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7663,14 +7663,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; 
GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8047,14 +8047,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9090,14 +9090,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: 
v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9474,14 +9474,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9999,14 +9999,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,14 
+10383,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11426,14 +11426,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -11810,14 +11810,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; 
GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index e62d6c593215b..a6b4679dbfb8b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -696,14 +696,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 
s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,14 +1397,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1729,14 +1729,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2430,14 +2430,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2762,14 +2762,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3549,14 +3549,14 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,14 +3929,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4813,14 +4813,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; 
GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,14 +5193,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6077,14 +6077,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 
s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,14 +6457,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1c14ff65dcbb6..10e4448c88797 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; 
GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -696,14 +696,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,14 +1397,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1729,14 +1729,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: 
s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2430,14 +2430,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2762,14 +2762,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3549,14 +3549,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,14 +3929,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 
s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4813,14 +4813,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,14 +5193,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6077,14 +6077,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 
s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,14 +6457,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index b97c3cdf32d12..7f689ec0e4ed8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -512,14 +512,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, 
div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -879,14 +879,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,14 +1837,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, 
v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2204,14 +2204,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3162,14 +3162,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 
s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3529,14 +3529,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3983,14 +3983,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4350,14 +4350,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, 
div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5307,14 +5307,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5674,14 +5674,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; 
GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6087,8 +6087,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6120,10 +6120,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6154,10 +6154,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6301,8 +6301,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6334,10 +6334,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6368,10 +6368,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6564,14 +6564,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; 
GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6948,14 +6948,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7990,14 +7990,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8374,14 +8374,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9417,14 +9417,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 
v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9801,14 +9801,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10326,14 +10326,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10710,14 +10710,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; 
GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11752,14 +11752,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -12136,14 +12136,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; 
GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index da1175c02e94a..b0c26724797d0 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -86,9 +86,9 @@ body: | ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024 = COPY_LANEMASK renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, lanemask(0x000003FFFFFFFFFF) ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = 
V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[COPY_LANEMASK:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY_LANEMASK]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) @@ -117,7 +117,7 @@ body: | ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr68 - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec + ; CHECK-NEXT: dead [[COPY_LANEMASK1:%[0-9]+]]:vreg_1024 = COPY_LANEMASK renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, lanemask(0x00000FFFFFFFFFFF), implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -129,7 +129,7 @@ body: | ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from 
%stack.0, align 4, addrspace 5) - ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 8e427a6ef2023..c6c021e6e89d5 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -100,10 +100,10 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 -; CIVI-NEXT: v_mov_b32_e32 v2, s4 ; CIVI-NEXT: v_mov_b32_e32 v4, s3 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v2, s4 ; CIVI-NEXT: v_mov_b32_e32 v3, s5 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v5, s2 ; CIVI-NEXT: flat_store_short v[2:3], v4 @@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v3, s3 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm @@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 
s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -163,8 +163,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -180,8 +180,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -193,10 +193,10 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out @@ -461,19 +461,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: 
s_lshr_b32 s6, s1, 16 -; CI-NEXT: s_lshr_b32 s7, s0, 16 ; CI-NEXT: s_lshr_b32 s8, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; CI-NEXT: s_lshr_b32 s6, s2, 16 +; CI-NEXT: s_lshr_b32 s7, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -492,19 +492,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 -; VI-NEXT: s_lshr_b32 s7, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; VI-NEXT: s_lshr_b32 s6, s2, 16 +; VI-NEXT: s_lshr_b32 s7, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -659,15 +659,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: v_cvt_f32_f16_e32 
v1, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v7, s3 ; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -683,15 +683,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -732,17 +732,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_lshr_b32 s5, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v9, s3 ; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -760,17 +760,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr 
addrspace(1) %out, <4 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -820,32 +820,33 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s7, s2, 16 ; CI-NEXT: s_lshr_b32 s8, s1, 16 -; CI-NEXT: s_lshr_b32 s6, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; CI-NEXT: s_add_u32 s0, s4, 48 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: s_lshr_b32 s6, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; CI-NEXT: s_add_u32 s0, s4, 48 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: s_add_u32 s0, s4, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: s_add_u32 s0, s4, 32 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: 
v_mov_b32_e32 v17, s1 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v9, s1 @@ -865,37 +866,38 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; VI-NEXT: s_lshr_b32 s7, s1, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; VI-NEXT: s_add_u32 s0, s4, 48 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; VI-NEXT: s_add_u32 s0, s4, 48 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; 
VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 @@ -1477,10 +1479,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1490,6 +1492,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1511,19 +1514,18 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: 
s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] @@ -1550,10 +1552,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v20, s2 @@ -1842,6 +1844,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1851,7 +1854,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1917,6 +1919,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1930,7 +1933,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; CI-NEXT: v_mov_b32_e32 v11, s3 ; 
CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1948,6 +1950,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1959,7 +1962,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v10, s2 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -2036,13 +2038,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v7, s3 ; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -2087,13 +2089,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v8, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_cvt_f32_f16_e32 v9, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2220,31 +2222,30 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v19, s3 ; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x70 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_add_u32 s2, s0, 0x70 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 @@ -2257,33 +2258,34 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: s_add_u32 s2, s0, 0x60 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 
; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: s_add_u32 s2, s0, 0x60 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x50 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: s_add_u32 s0, s0, 64 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_add_u32 s0, s0, 64 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] @@ -2317,10 +2319,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v18, s3 ; VI-NEXT: v_mov_b32_e32 v17, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: v_mov_b32_e32 v12, s1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2332,22 +2334,22 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 64 +; VI-NEXT: v_mov_b32_e32 v14, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: v_mov_b32_e32 v16, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] ; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2365,15 +2367,15 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 ; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 ; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v14, s1 ; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: v_mov_b32_e32 v14, s1 ; VI-NEXT: v_mov_b32_e32 v13, s0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] ; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] @@ -2646,8 +2648,8 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-NEXT: 
v_mov_b32_e32 v1, s3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: flat_store_short v[0:1], v2 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_or_b32_e32 v2, v4, v3 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -2665,14 +2667,14 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm @@ -2811,8 +2813,8 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 @@ -2849,8 +2851,8 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 @@ -2943,15 +2945,15 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; 
CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_add_u32 s4, s2, 48 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v9, s3 -; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v9, s3 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: v_mov_b32_e32 v13, s3 @@ -2964,10 +2966,11 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -2980,12 +2983,11 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 @@ -3014,14 +3016,14 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 48 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; 
VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3379,50 +3381,50 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 ; CI-NEXT: s_lshr_b32 s0, s5, 16 -; CI-NEXT: s_lshr_b32 s11, s1, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: s_lshr_b32 s11, s1, 16 ; CI-NEXT: s_lshr_b32 s12, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; CI-NEXT: s_lshr_b32 s10, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 ; CI-NEXT: s_lshr_b32 s0, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s12 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 ; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; CI-NEXT: v_add_f32_e32 v1, v1, v9 -; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_add_f32_e32 v3, v3, v11 ; CI-NEXT: v_add_f32_e32 v2, v2, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v5, v5, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v4, v4, v12 +; CI-NEXT: v_add_f32_e32 v1, v1, v9 +; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v7, v7, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v6, v6, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v5, v5, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v4, v4, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v3, v7, v3 +; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_or_b32_e32 v1, v5, v1 ; CI-NEXT: v_or_b32_e32 v0, v4, v0 ; CI-NEXT: v_mov_b32_e32 v4, s8 -; CI-NEXT: v_or_b32_e32 v3, v7, v3 -; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16a..29aedda49da70 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s + define void @main(i1 %arg) #0 { ; CHECK-LABEL: main: ; CHECK: ; %bb.0: ; %bb @@ -151,8 +152,8 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 ; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 ; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 @@ -203,10 +204,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s20, s16 ; CHECK-NEXT: s_mov_b32 s21, s16 -; CHECK-NEXT: v_mov_b32_e32 v1, s20 ; 
CHECK-NEXT: s_mov_b32 s17, s16 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s20 ; CHECK-NEXT: v_mov_b32_e32 v2, s21 ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[16:19] dmask:0x1 ; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 2daed9b69384f..23ce500e7b25b 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -286,11 +286,11 @@ define amdgpu_kernel void @llvm_ubsantrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V4-NEXT: s_add_u32 s0, s8, 8 -; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 @@ -311,11 +311,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V5-NEXT: s_add_u32 s0, s8, 8 -; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 8fcf1ad3fbc95..5ab2dcbedb537 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ 
b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -2125,15 +2125,15 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2174,15 +2174,15 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -2479,15 +2479,15 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2528,15 +2528,15 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -2837,15 +2837,15 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 
v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2887,15 +2887,15 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3192,15 +3192,15 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3240,15 +3240,15 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3509,7 +3509,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 @@ -3525,12 +3524,13 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3570,15 +3570,15 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; 
VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3595,6 +3595,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE: ; %bb.0: ; %entry ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 @@ -3610,10 +3611,9 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 @@ -3826,7 +3826,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 @@ -3845,12 +3844,13 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; 
VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3891,15 +3891,15 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -4328,9 +4328,9 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 ; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 ; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc @@ -4341,25 +4341,25 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 ; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s0, 48 ; 
VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 -; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 ; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 @@ -4826,15 +4826,15 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc -; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 @@ -6399,16 +6399,16 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[18:19], 
v[14:17] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6886,8 +6886,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 -; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6941,8 +6941,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2 -; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6960,38 +6960,39 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 -; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 
; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: s_nop 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23] @@ -7026,8 +7027,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off -; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7048,38 +7049,39 @@ define amdgpu_kernel void 
@insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: s_nop 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0 +; VI-IDXMODE-NEXT: 
flat_store_dwordx4 v[0:1], v[24:27] ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23] @@ -7114,8 +7116,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -8003,13 +8005,13 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8033,17 +8035,17 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; 
VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8265,13 +8267,13 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8295,17 +8297,17 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8528,13 +8530,13 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8558,17 +8560,17 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8835,15 +8837,15 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 
v15, s23 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -8885,15 +8887,15 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..1ce5ff51e6a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -23,8 +23,8 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_cndmask_b32_e32 
v0, 1.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -68,11 +68,11 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3 ; GCN-NEXT: s_cselect_b32 s1, s1, 1 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -95,8 +95,8 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -116,7 +116,6 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 @@ -124,8 +123,9 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 ; GCN-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 @@ -149,7 +149,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; 
GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 @@ -169,12 +168,13 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -204,7 +204,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -239,36 +238,37 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: 
v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -300,8 +300,8 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -428,8 +428,8 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -452,8 +452,8 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -550,11 +550,11 @@ define amdgpu_kernel void 
@byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-NEXT: s_or_b32 s0, s0, s8 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_or_b32 s0, s0, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -577,11 +577,11 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-NEXT: s_cmp_eq_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1 ; GCN-NEXT: s_cselect_b32 s0, 0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -618,13 +618,13 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-NEXT: s_add_u32 s0, s10, 16 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s10, 32 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NEXT: s_add_u32 s0, s10, 32 ; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -633,9 +633,9 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-NEXT: s_addc_u32 s1, s11, 0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], 
v[0:1] ; GCN-NEXT: s_endpgm @@ -674,15 +674,15 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -732,17 +732,18 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: s_addc_u32 s1, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v15, s1 ; GCN-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-NEXT: s_add_u32 s0, s6, 48 +; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-NEXT: s_addc_u32 s1, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_add_u32 s0, s6, 32 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -800,39 +801,39 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 -; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -898,43 +899,44 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v31, s3 ; GCN-NEXT: v_mov_b32_e32 v30, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v31, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: 
v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_mov_b32_e32 v8, s2 -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29] -; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] @@ -1830,9 +1832,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_readlane_b32 s1, v6, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 7cbf9aeacfe48..b93b29464b309 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1979,10 +1979,10 @@ define 
amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: .LBB42_2: ; %if ; SI-NEXT: s_load_dword s5, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -2003,10 +2003,10 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s5, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 300124848c1aa..cb9b6888d1fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -40,10 +40,10 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: s_mov_b32 s50, s48 ; CHECK-NEXT: s_cselect_b32 s51, 0, s1 ; CHECK-NEXT: s_cselect_b32 s55, 0, s35 -; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; CHECK-NEXT: s_cselect_b32 s52, 0, s2 ; CHECK-NEXT: s_cselect_b32 s56, 0, s36 ; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43 +; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; CHECK-NEXT: v_mov_b32_e32 v4, s50 ; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0 ; CHECK-NEXT: s_cselect_b32 s53, 0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index c001df48499c7..f316f3d5defaa 100644 --- 
a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -98,10 +98,10 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_or_b32_e32 v8, v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: .LBB0_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB0_8: ; %Flow2 @@ -227,10 +227,10 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, v8 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_or_b32_e32 v7, v14, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v7 -; GCN-NEXT: v_mov_b32_e32 v1, v8 ; GCN-NEXT: .LBB1_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB1_8: ; %Flow2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 2f9182e6e7c6a..56df10707667f 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -94,10 +94,10 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 -; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: .LBB0_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_8: ; %Flow2 @@ -224,12 +224,10 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 
1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB0_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB0_8: ; %Flow2 @@ -349,10 +347,10 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 -; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: .LBB1_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_8: ; %Flow2 @@ -469,12 +467,10 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB1_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB1_8: ; %Flow2 @@ -610,11 +606,11 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v5, v7 ; SDAG-NEXT: v_or_b32_e32 v4, v4, v6 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v6, v10 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v6, v10 ; SDAG-NEXT: v_mov_b32_e32 v7, v11 ; SDAG-NEXT: .LBB2_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -661,9 +657,9 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -756,27 +752,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0 ; GISEL-NEXT: v_and_or_b32 v1, v12, v3, v1 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v9, v14, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, v9 -; GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 ; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB2_8: ; %Flow2 ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: s_cbranch_execz .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v3 ; GISEL-NEXT: v_or_b32_e32 v2, v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v3 ; GISEL-NEXT: v_mov_b32_e32 v4, v2 ; GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 @@ -898,11 +891,11 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v4, v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v4 -; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: 
v_mov_b32_e32 v2, v9 ; SDAG-NEXT: v_mov_b32_e32 v3, v10 ; SDAG-NEXT: .LBB3_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -945,9 +938,9 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f64: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1032,10 +1025,10 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v14, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 ; GISEL-NEXT: .LBB3_7: ; %Flow1 @@ -1044,8 +1037,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; GISEL-NEXT: s_cbranch_execz .LBB3_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v8 @@ -1183,10 +1176,10 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 -; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: .LBB4_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB4_8: 
; %Flow2 @@ -1314,12 +1307,10 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB4_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB4_8: ; %Flow2 @@ -1440,10 +1431,10 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 -; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: .LBB5_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB5_8: ; %Flow2 @@ -1561,12 +1552,10 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB5_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB5_8: ; %Flow2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..a05977d630217 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -863,11 +863,11 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -1002,10 +1002,10 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_short v[2:3], v4 @@ -1121,9 +1121,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm @@ -1200,9 +1200,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1400,8 +1400,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1474,8 +1474,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1549,10 +1549,10 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: 
s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_byte v[2:3], v4 @@ -1693,13 +1693,13 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 8 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s6, 16 ; VI-NEXT: s_addc_u32 s5, s7, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -2024,15 +2024,15 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s6, 16 ; VI-NEXT: s_addc_u32 s5, s7, 0 -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: 
v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -2129,19 +2129,19 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 -; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: s_addc_u32 s13, s9, 0 -; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s12 ; VI-NEXT: v_mov_b32_e32 v4, s13 ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -2271,19 +2271,19 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 -; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: s_addc_u32 s13, s9, 0 -; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s12 ; VI-NEXT: v_mov_b32_e32 v4, s13 ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> 
%in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2653,8 +2653,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -2909,10 +2909,10 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3020,10 +3020,10 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3124,8 +3124,8 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: 
v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -3582,10 +3582,10 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -4050,21 +4050,21 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 @@ -4238,21 +4238,21 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: 
v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 @@ -4544,13 +4544,13 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_byte v[4:5], v6 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4563,8 +4563,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -4821,8 +4821,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 1 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -4971,8 +4971,8 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 ; VI-NEXT: 
v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -5095,10 +5095,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5120,10 +5120,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5208,8 +5208,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5231,18 +5232,18 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: s_add_u32 s2, s4, 50 ; VI-NEXT: s_addc_u32 s3, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 
v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_add_u32 s0, s4, 51 -; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] @@ -5680,12 +5681,12 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: s_add_u32 s2, s0, 2 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s4, 42 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ushort v4, v[4:5] @@ -6011,22 +6012,22 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 2fa865ff4929c..c19f6f3b810e4 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -133,8 +133,8 @@ define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind { ; GCN-NEXT: v_mov_b32_e32 v6, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -156,8 +156,8 @@ define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v9, s5 ; GCN-NEXT: v_mov_b32_e32 v8, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -176,8 +176,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: global_store_byte v2, v3, s[0:1] offset:8 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -216,10 +216,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: global_store_dword v[0:1], v2, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -578,8 +578,8 @@ 
define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; GCN-LABEL: byref_flat_i32_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: flat_load_dword v0, v[0:1] offset:8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 217c306a1ff93..d262f7dc03ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -59,8 +59,8 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -119,8 +119,8 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -1477,8 +1477,8 @@ define amdgpu_ps void 
@test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inre define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -1553,8 +1553,8 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv_inreg_src(<16 x float> inre define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl_inreg_src(<16 x float> inreg inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -2197,8 +2197,8 @@ define <32 x float> @test_cvt_scale_pk32_f32_fp6_inreg_src(<6 x i32> inreg %src, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32 @@ -2226,8 +2226,8 @@ define <32 x float> @test_cvt_scale_pk32_f32_bf6_inreg_src(<6 x i32> inreg %src, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32 @@ -2255,8 +2255,8 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src(<6 x i32> inreg ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16 @@ -2347,8 +2347,8 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src(<6 x i32> inreg ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index 6cc6ba732d805..3804652e2a203 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -150,21 +150,21 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s1, 0x3fc00000 ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s7, s4 ; 
ASM-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s6, s4 +; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s7, s4 +; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 -; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 +; ASM-GISEL-TRUE16-NEXT: global_store_b32 v[17:18], v0, off +; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr8 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr9_vgpr10 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr15 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-TRUE16-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-TRUE16-NEXT: .LBB1_2: ; %if.end ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -195,21 +195,21 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s4, 0 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s1, 0x3fc00000 ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s7, s4 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s5, s4 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s6, s4 +; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s7, s4 +; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 -; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 +; ASM-GISEL-FAKE16-NEXT: 
global_store_b32 v[17:18], v0, off +; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr6_vgpr7 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr8 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr9_vgpr10 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr15 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-FAKE16-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-FAKE16-NEXT: .LBB1_2: ; %if.end ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -221,6 +221,7 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] + ; ASM-GISEL-LABEL: dead_struct: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -515,6 +516,7 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] + ; ASM-GISEL-LABEL: dead_array: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 9e4824694e76a..38e91757b9763 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; 
VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -101,8 +101,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -114,8 +114,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -204,8 +204,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -218,8 +218,8 @@ define 
amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -264,8 +264,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -278,8 +278,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -324,8 +324,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -338,8 +338,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 
s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -384,8 +384,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -398,8 +398,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -444,8 +444,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -458,8 +458,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -504,8 +504,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -518,8 +518,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -564,8 +564,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -578,8 +578,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -624,8 +624,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: 
v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -638,8 +638,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -684,8 +684,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -698,8 +698,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -744,8 +744,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: 
v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -758,8 +758,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -804,8 +804,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -818,8 +818,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -864,8 +864,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -878,8 +878,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) 
{ ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -924,8 +924,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -938,8 +938,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -984,8 +984,8 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -998,8 +998,8 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: 
v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -1861,8 +1861,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -1876,8 +1876,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -1928,8 +1928,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -1943,8 +1943,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: 
s_endpgm @@ -2034,8 +2034,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2048,8 +2048,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2095,8 +2095,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2109,8 +2109,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2156,8 +2156,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, 
v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2170,8 +2170,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2217,8 +2217,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2231,8 +2231,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2278,8 +2278,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; 
VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2292,8 +2292,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2339,8 +2339,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2353,8 +2353,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2414,8 +2414,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2461,8 +2461,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2475,8 +2475,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2522,8 +2522,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2536,8 +2536,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; 
VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2583,8 +2583,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2597,8 +2597,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2644,8 +2644,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2658,8 +2658,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2704,8 +2704,8 @@ define 
amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2718,8 +2718,8 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2764,8 +2764,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2778,8 +2778,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2824,8 +2824,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; 
VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2838,8 +2838,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 366b71bae75c9..e1c671c4eeb0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -44,8 +44,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -71,8 +71,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -146,8 +146,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: 
v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -173,8 +173,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -206,8 +206,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -233,8 +233,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -266,8 +266,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: 
v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -293,8 +293,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -326,8 +326,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -353,8 +353,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -386,8 +386,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -413,8 +413,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; 
GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -446,8 +446,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -473,8 +473,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -533,8 +533,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -566,8 +566,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -593,8 +593,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -626,8 +626,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -653,8 +653,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -684,8 +684,8 @@ define amdgpu_kernel void 
@v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -742,8 +742,8 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -800,8 +800,8 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -858,8 +858,8 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -916,8 +916,8 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; 
SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -974,8 +974,8 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1032,8 +1032,8 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1090,8 +1090,8 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1148,8 +1148,8 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1206,8 +1206,8 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1368,8 +1368,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1395,8 +1395,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: 
v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1428,8 +1428,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1455,8 +1455,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1488,8 +1488,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1515,8 +1515,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; 
GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1548,8 +1548,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1575,8 +1575,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1608,8 +1608,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1635,8 +1635,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1668,8 +1668,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) 
%out, i16 %src) #1 { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1695,8 +1695,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1728,8 +1728,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1755,8 +1755,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1788,8 +1788,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; 
SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1815,8 +1815,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1848,8 +1848,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1875,8 +1875,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1912,8 +1912,8 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll index 80f295b939709..0f1a487d13431 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll @@ -15,8 +15,8 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa ; GCN-NEXT: v_sub_f32_e32 v4, v2, v3 ; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2 ; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll index 44a4e8171ff33..3df81ac2e551a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -18,11 +18,11 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GCN-LABEL: load_1d_lwe: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v9, v8 ; GCN-NEXT: v_mov_b32_e32 v10, v8 ; GCN-NEXT: v_mov_b32_e32 v11, v8 ; GCN-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 ; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: v_mov_b32_e32 v2, v10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 4d9f0943a802d..fd9af6e536617 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -84,11 +84,11 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_1d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 
v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -131,13 +131,13 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_1d_tfe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -199,11 +199,11 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -246,13 +246,13 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_1d_lwe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; 
GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -352,12 +352,12 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -401,13 +401,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_2d_tfe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 
+; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -417,11 +418,12 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v0 -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v4, v11 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] @@ -515,13 +517,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_3d_tfe_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -567,13 +569,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: 
v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -681,13 +683,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace ; GFX6789-LABEL: load_cube_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -733,13 +735,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: 
v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -839,12 +841,12 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_1darray_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -888,13 +890,14 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11-LABEL: load_1darray_tfe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, 
s[8:9] @@ -904,11 +907,12 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v0 -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v4, v11 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1002,13 +1006,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_2darray_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1054,13 +1058,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; 
GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1162,13 +1166,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_2dmsaa_both: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1214,13 +1218,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 
v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1330,14 +1334,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX6789-LABEL: load_2darraymsaa_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 -; GFX6789-NEXT: v_mov_b32_e32 v8, v3 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 ; GFX6789-NEXT: v_mov_b32_e32 v11, v9 ; GFX6789-NEXT: v_mov_b32_e32 v12, v9 ; GFX6789-NEXT: v_mov_b32_e32 v13, v9 +; GFX6789-NEXT: v_mov_b32_e32 v8, v3 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: v_mov_b32_e32 v1, v10 ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 @@ -1384,13 +1388,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 ; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] @@ 
-1401,11 +1406,12 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 +; GFX12-NEXT: v_dual_mov_b32 v4, v13 :: v_dual_mov_b32 v3, v12 ; GFX12-NEXT: image_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] @@ -1497,12 +1503,12 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_mip_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -1546,13 +1552,14 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: load_mip_1d_lwe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; 
GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1654,13 +1661,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_mip_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1706,13 +1713,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 
:: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -2133,10 +2140,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask3: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v5, 0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v5 ; GFX6789-NEXT: v_mov_b32_e32 v7, v5 ; GFX6789-NEXT: v_mov_b32_e32 v8, v5 +; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v5 ; GFX6789-NEXT: v_mov_b32_e32 v1, v6 ; GFX6789-NEXT: v_mov_b32_e32 v2, v7 @@ -2176,11 +2183,12 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX11-LABEL: load_1d_tfe_V4_dmask3: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v3, s[8:9] @@ -2190,9 +2198,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 ; GFX12-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5 -; GFX12-NEXT: v_mov_b32_e32 v8, v5 -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: 
v_mov_b32_e32 v2, v7 ; GFX12-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v3, s[8:9] @@ -2237,9 +2245,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask2: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v4, 0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v4 ; GFX6789-NEXT: v_mov_b32_e32 v6, v4 +; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 ; GFX6789-NEXT: v_mov_b32_e32 v1, v5 ; GFX6789-NEXT: v_mov_b32_e32 v2, v6 @@ -2276,10 +2284,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a ; GFX11-LABEL: load_1d_tfe_V4_dmask2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v4, v2, s[8:9] @@ -2333,8 +2341,8 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe @@ -2422,8 +2430,8 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V2_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; 
GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 3d64ef16a3c8c..f5c4d08bfe871 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -26,15 +26,15 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05] ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; encoding: [0x52,0x02,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] -; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v0, v8 ; 
encoding: [0x08,0x03,0x00,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 ; encoding: [0x0a,0x01,0x10,0xca,0x09,0x01,0x00,0x02] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x04,0x03] ; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00] @@ -85,15 +85,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09] ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05] +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] -; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x13,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 ; encoding: [0x09,0x01,0x10,0xca,0x0c,0x01,0x02,0x00] +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; encoding: [0x0a,0x01,0x10,0xca,0x0d,0x01,0x04,0x01] ; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00] @@ -104,13 +105,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05] ; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x23,0x01,0x87,0xbf] -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a] -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 
v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0a] +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] -; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v4, v13 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0d,0x01,0x10,0xca,0x0c,0x01,0x02,0x04] ; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05] ; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00] @@ -196,12 +198,11 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03] ; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; encoding: [0x32,0x01,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: 
[0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 ; encoding: [0x08,0x01,0x10,0xca,0x07,0x01,0x00,0x02] ; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 5a35c696c6e44..0f6bed26e7455 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -129,8 +129,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-TRUE16-NEXT: image_sample v[3:4], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -145,7 +144,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-FAKE16-NEXT: image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -158,8 +157,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-TRUE16-NEXT: image_sample v[3:4], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 @@ -174,7 +172,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-FAKE16-NEXT: image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 8b60aa0e48cda..59c059b2d48a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -74,11 +74,11 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; 
GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -117,13 +117,13 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -634,11 +634,11 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -677,13 +677,13 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 
-; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index 5a27a72de274d..8fafce03b90d1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -64,8 +64,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11] -; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13 +; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13 ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9] @@ -111,8 +111,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] ; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; DAGISEL10-NEXT: s_mov_b64 exec, s[10:11] -; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13 +; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13 ; DAGISEL10-NEXT: ; %bb.2: ; %tail ; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9] diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 614566a230f68..d70867d58043f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; TODO: Run these for global isel as well. ; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 < %s 2>&1 | FileCheck -check-prefix=ERR %s @@ -10,6 +9,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; TODO: Run these for global isel as well. 
+ ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) @@ -116,15 +117,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; ; GFX12-GISEL-TRUE16-LABEL: image_bvh_intersect_ray_a16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %main_body -; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s20, s2 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s22, s4 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s20, s2 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 +; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 -; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s16, s9 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s17, s10 @@ -137,14 +138,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; GFX12-GISEL-FAKE16-LABEL: image_bvh_intersect_ray_a16: ; GFX12-GISEL-FAKE16: ; %bb.0: ; %main_body ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s20, s2 +; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s22, s4 ; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s7, s5 -; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-FAKE16-NEXT: s_pack_hh_b32_b16 s5, s7, s5 ; 
GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 -; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s16, s9 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s17, s10 @@ -264,9 +265,9 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s22, s5 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s9, s7 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 -; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s16, s10 @@ -281,13 +282,13 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX12-GISEL-FAKE16: ; %bb.0: ; %main_body ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s20, s3 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s4 -; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s22, s5 +; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_pack_hh_b32_b16 s5, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s9, s7 -; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-FAKE16-NEXT: 
v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s16, s10 @@ -417,22 +418,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 -; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v8, s14 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v7, s13 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 
v2, vcc_lo, v2, v4 @@ -441,9 +444,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -572,15 +575,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -589,9 +594,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -732,29 +737,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -886,25 +891,25 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 -; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 193fbdf35ec74..078e6d3fd0078 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -315,8 +315,8 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; SDAG_W64-NEXT: ; %bb.1: ; %if ; SDAG_W64-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W64-NEXT: s_addc_u32 s1, s1, 0 -; SDAG_W64-NEXT: v_mov_b32_e32 v3, s1 ; SDAG_W64-NEXT: v_mov_b32_e32 v2, s0 +; SDAG_W64-NEXT: v_mov_b32_e32 v3, s1 ; SDAG_W64-NEXT: ; %bb.2: ; %endif ; SDAG_W64-NEXT: s_or_b64 exec, exec, s[2:3] ; SDAG_W64-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -343,7 +343,7 @@ define amdgpu_cs void @inverse_ballot_branch(i64 
inreg %s0_1, i64 inreg %s2, ptr ; SDAG_W32-NEXT: ; %bb.1: ; %if ; SDAG_W32-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W32-NEXT: s_addc_u32 s1, s1, 0 -; SDAG_W32-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; SDAG_W32-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; SDAG_W32-NEXT: ; %bb.2: ; %endif ; SDAG_W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; SDAG_W32-NEXT: global_store_b64 v[0:1], v[2:3], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 679b289e13969..574c3c9c2d237 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -1306,8 +1306,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1334,8 +1334,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] @@ -1627,17 +1627,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1655,17 +1655,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] @@ -1750,8 
+1750,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1778,8 +1778,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 2fb677eccc4b3..6aec0859bf3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -2257,10 +2257,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; 
GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2304,10 +2304,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2467,10 +2467,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2499,11 +2499,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2532,10 +2532,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2564,11 +2564,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2771,10 +2771,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; 
GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2818,10 +2818,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2982,10 +2982,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3029,10 +3029,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 
s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3193,10 +3193,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3240,10 +3240,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3404,10 +3404,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3451,10 +3451,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3614,10 +3614,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3646,11 +3646,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], 
s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3679,10 +3679,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3711,11 +3711,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3917,10 +3917,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; 
GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3949,11 +3949,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3982,10 +3982,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4014,11 +4014,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; 
GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4220,10 +4220,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4252,11 +4252,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4285,10 
+4285,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4317,11 +4317,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4523,10 +4523,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; 
GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4555,11 +4555,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4588,10 +4588,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4620,11 +4620,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 13a96cfa6e650..147086a00cee8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + ; FIXME: bfloat vector arguments are broken in globalisel. ; https://github.com/llvm/llvm-project/issues/77055 @@ -22,8 +23,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -94,8 +95,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -248,55 +249,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a31, s23 +; GCN-NEXT: v_accvgpr_write_b32 a30, s22 +; GCN-NEXT: v_accvgpr_write_b32 a29, s21 +; GCN-NEXT: v_accvgpr_write_b32 a28, s20 +; GCN-NEXT: v_accvgpr_write_b32 a27, s19 +; GCN-NEXT: v_accvgpr_write_b32 a26, s18 +; GCN-NEXT: v_accvgpr_write_b32 a25, s17 +; GCN-NEXT: v_accvgpr_write_b32 a24, s16 +; GCN-NEXT: v_accvgpr_write_b32 a23, s15 +; GCN-NEXT: v_accvgpr_write_b32 a22, s14 +; GCN-NEXT: v_accvgpr_write_b32 a21, s13 +; GCN-NEXT: v_accvgpr_write_b32 a20, s12 +; GCN-NEXT: v_accvgpr_write_b32 a19, s11 +; GCN-NEXT: 
v_accvgpr_write_b32 a18, s10 +; GCN-NEXT: v_accvgpr_write_b32 a17, s9 +; GCN-NEXT: v_accvgpr_write_b32 a16, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[2:5], v[6:9], a[16:31] +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 
v36, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -311,55 +319,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; 
GCN-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a31, s23 +; GCN-NEXT: v_accvgpr_write_b32 a30, s22 +; GCN-NEXT: v_accvgpr_write_b32 a29, s21 +; GCN-NEXT: v_accvgpr_write_b32 a28, s20 +; GCN-NEXT: v_accvgpr_write_b32 a27, s19 +; GCN-NEXT: v_accvgpr_write_b32 a26, s18 +; GCN-NEXT: v_accvgpr_write_b32 a25, s17 +; GCN-NEXT: v_accvgpr_write_b32 a24, s16 +; GCN-NEXT: v_accvgpr_write_b32 a23, s15 +; GCN-NEXT: v_accvgpr_write_b32 a22, s14 +; GCN-NEXT: v_accvgpr_write_b32 a21, s13 +; GCN-NEXT: v_accvgpr_write_b32 a20, s12 +; GCN-NEXT: v_accvgpr_write_b32 a19, s11 +; GCN-NEXT: v_accvgpr_write_b32 a18, s10 +; GCN-NEXT: v_accvgpr_write_b32 a17, s9 +; GCN-NEXT: v_accvgpr_write_b32 a16, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; 
GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) @@ -375,26 +390,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; 
GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 +; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s10 +; GCN-NEXT: v_accvgpr_write_b32 a3, s11 +; GCN-NEXT: v_accvgpr_write_b32 a4, s12 +; GCN-NEXT: v_accvgpr_write_b32 a5, s13 +; GCN-NEXT: v_accvgpr_write_b32 a6, s14 +; GCN-NEXT: v_accvgpr_write_b32 a7, s15 +; GCN-NEXT: v_accvgpr_write_b32 a8, s16 +; GCN-NEXT: v_accvgpr_write_b32 a9, s17 +; GCN-NEXT: v_accvgpr_write_b32 a10, s18 +; GCN-NEXT: v_accvgpr_write_b32 a11, s19 +; GCN-NEXT: v_accvgpr_write_b32 a12, s20 +; GCN-NEXT: v_accvgpr_write_b32 a13, s21 +; GCN-NEXT: v_accvgpr_write_b32 a14, s22 +; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 10 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -408,31 +431,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
v_mov_b64_e32 v[16:17], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 +; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s10 +; GCN-NEXT: v_accvgpr_write_b32 a3, s11 +; GCN-NEXT: v_accvgpr_write_b32 a4, s12 +; GCN-NEXT: v_accvgpr_write_b32 a5, s13 +; GCN-NEXT: v_accvgpr_write_b32 a6, s14 +; GCN-NEXT: v_accvgpr_write_b32 a7, s15 +; GCN-NEXT: v_accvgpr_write_b32 a8, s16 +; GCN-NEXT: v_accvgpr_write_b32 a9, s17 +; GCN-NEXT: v_accvgpr_write_b32 a10, s18 +; GCN-NEXT: v_accvgpr_write_b32 a11, s19 +; GCN-NEXT: v_accvgpr_write_b32 a12, s20 +; GCN-NEXT: v_accvgpr_write_b32 a13, s21 +; GCN-NEXT: v_accvgpr_write_b32 a14, s22 +; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 10 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GCN-NEXT: 
global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..c64845e35fe51 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -4,6 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s + declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -141,18 +142,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 
v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -164,14 +167,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -179,18 +184,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -203,8 +210,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] @@ -260,18 +267,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 
v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -283,14 +292,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 
v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -298,18 +309,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -322,8 +335,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; 
VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 @@ -389,8 +402,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -455,8 +468,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -472,8 +485,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -514,8 +527,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: 
v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -580,8 +593,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -770,8 +783,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -836,8 +849,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -853,8 +866,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: 
v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -895,8 +908,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -961,8 +974,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -1485,55 +1498,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v36, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: 
v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 
sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1542,44 +1562,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; 
GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; 
GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1588,55 
+1616,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v36, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v10, s20 +; HEURRC-NEXT: v_mov_b32_e32 v11, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v12, s22 +; HEURRC-NEXT: v_mov_b32_e32 v13, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; 
HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1645,13 +1680,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1659,41 +1694,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: 
v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[34:37], v[38:41], v[16:31] +; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[42:45], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: 
global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1827,55 +1862,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v36, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: 
v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: 
v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1884,44 +1926,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: 
v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: 
global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1930,55 +1980,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v36, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] 
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v10, s20 +; HEURRC-NEXT: v_mov_b32_e32 v11, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v12, s22 +; HEURRC-NEXT: v_mov_b32_e32 v13, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: 
global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt 
vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1987,13 +2044,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -2001,41 +2058,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 +; VGPRRC-NEXT: v_mov_b32_e32 
v45, s23 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[42:45], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ 
-2170,26 +2227,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: 
global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2198,26 +2263,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 
a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2226,26 +2299,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 
s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2257,8 +2338,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: 
v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -2351,26 +2432,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: 
v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2379,67 +2468,83 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GISEL-NEXT: s_endpgm -; -; 
HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: -; HEURRC: ; %bb.0: -; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 -; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; HEURRC-NEXT: s_endpgm -; -; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: -; VGPRRC: ; %bb.0: -; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 -; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: 
v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 
a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: s_nop 10 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -2661,24 +2766,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: 
v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2690,14 +2795,16 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2705,24 +2812,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; HEURRC-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: v_mov_b32_e32 v6, s12 +; HEURRC-NEXT: v_mov_b32_e32 v7, s13 +; HEURRC-NEXT: v_mov_b32_e32 v8, s14 +; HEURRC-NEXT: v_mov_b32_e32 v9, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2730,24 +2837,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, 
s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -2808,24 +2915,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: 
v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2837,14 +2944,16 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2852,24 +2961,24 @@ define amdgpu_kernel void 
@test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: v_mov_b32_e32 v6, s12 +; HEURRC-NEXT: v_mov_b32_e32 v7, s13 +; HEURRC-NEXT: v_mov_b32_e32 v8, s14 +; HEURRC-NEXT: v_mov_b32_e32 v9, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2877,24 +2986,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: 
v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -2967,11 +3076,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_mov_b32_e32 v8, s28 ; SDAG-NEXT: v_mov_b32_e32 v9, s29 ; SDAG-NEXT: v_mov_b32_e32 v10, s30 ; SDAG-NEXT: v_mov_b32_e32 v11, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3038,8 +3147,8 @@ 
define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3055,8 +3164,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -3097,11 +3206,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; HEURRC-NEXT: v_mov_b32_e32 v5, s25 ; HEURRC-NEXT: v_mov_b32_e32 v6, s26 ; HEURRC-NEXT: v_mov_b32_e32 v7, s27 -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_mov_b32_e32 v8, s28 ; HEURRC-NEXT: v_mov_b32_e32 v9, s29 ; HEURRC-NEXT: v_mov_b32_e32 v10, s30 ; HEURRC-NEXT: v_mov_b32_e32 v11, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 ; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 ; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3168,11 +3277,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 ; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 ; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 -; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v40, s28 ; VGPRRC-NEXT: v_mov_b32_e32 v41, s29 ; VGPRRC-NEXT: v_mov_b32_e32 v42, s30 ; VGPRRC-NEXT: v_mov_b32_e32 v43, s31 +; 
VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -3379,11 +3488,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_mov_b32_e32 v8, s28 ; SDAG-NEXT: v_mov_b32_e32 v9, s29 ; SDAG-NEXT: v_mov_b32_e32 v10, s30 ; SDAG-NEXT: v_mov_b32_e32 v11, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3450,8 +3559,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3467,8 +3576,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -3509,11 +3618,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b32_e32 v5, s25 ; HEURRC-NEXT: v_mov_b32_e32 v6, s26 ; HEURRC-NEXT: v_mov_b32_e32 v7, 
s27 -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_mov_b32_e32 v8, s28 ; HEURRC-NEXT: v_mov_b32_e32 v9, s29 ; HEURRC-NEXT: v_mov_b32_e32 v10, s30 ; HEURRC-NEXT: v_mov_b32_e32 v11, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 ; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 ; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3580,11 +3689,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 ; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 ; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 -; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v40, s28 ; VGPRRC-NEXT: v_mov_b32_e32 v41, s29 ; VGPRRC-NEXT: v_mov_b32_e32 v42, s30 ; VGPRRC-NEXT: v_mov_b32_e32 v43, s31 +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -4124,63 +4233,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s24 -; SDAG-NEXT: v_mov_b32_e32 v37, s25 -; SDAG-NEXT: v_mov_b32_e32 v38, s26 -; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: 
v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: 
v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4189,44 +4305,52 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: 
s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; 
GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] 
offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4234,63 +4358,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v40, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v36, s24 -; HEURRC-NEXT: v_mov_b32_e32 v37, s25 -; HEURRC-NEXT: v_mov_b32_e32 v38, s26 -; HEURRC-NEXT: v_mov_b32_e32 v39, s27 +; HEURRC-NEXT: v_mov_b32_e32 v6, s24 +; HEURRC-NEXT: v_mov_b32_e32 v7, s25 +; HEURRC-NEXT: v_mov_b32_e32 v8, s26 +; HEURRC-NEXT: v_mov_b32_e32 v9, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, 
s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: 
v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4298,17 +4429,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 -; VGPRRC-NEXT: 
v_mov_b32_e32 v38, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4319,42 +4450,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; 
VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: @@ -4501,63 +4632,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s24 -; SDAG-NEXT: v_mov_b32_e32 v37, s25 -; SDAG-NEXT: v_mov_b32_e32 v38, s26 -; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], 
s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, 
s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4566,44 +4704,52 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a 
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: 
v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; 
GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4611,63 +4757,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v40, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v36, s24 -; HEURRC-NEXT: v_mov_b32_e32 v37, s25 -; HEURRC-NEXT: v_mov_b32_e32 v38, s26 -; HEURRC-NEXT: v_mov_b32_e32 v39, s27 +; HEURRC-NEXT: v_mov_b32_e32 v6, s24 +; HEURRC-NEXT: v_mov_b32_e32 v7, s25 +; HEURRC-NEXT: v_mov_b32_e32 v8, s26 +; HEURRC-NEXT: v_mov_b32_e32 v9, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, 
v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4675,17 +4828,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], 
s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4696,42 +4849,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; 
VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: @@ -4879,32 +5032,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; 
SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -4913,26 +5074,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 
x i32> %arg0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: 
global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -4940,32 +5109,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v20, s24 -; HEURRC-NEXT: v_mov_b32_e32 v21, s25 -; HEURRC-NEXT: v_mov_b32_e32 v22, s26 -; HEURRC-NEXT: v_mov_b32_e32 v23, s27 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, 
s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5085,32 +5262,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: 
v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; 
SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5119,26 +5304,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 
-; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5146,32 +5339,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v20, s24 -; HEURRC-NEXT: v_mov_b32_e32 v21, s25 -; HEURRC-NEXT: v_mov_b32_e32 v22, s26 -; HEURRC-NEXT: v_mov_b32_e32 v23, s27 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 
v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5421,18 +5622,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; 
GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5440,18 +5643,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5464,8 +5669,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] @@ -5521,18 +5726,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], 
v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5540,18 +5747,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5564,8 +5773,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: 
v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 @@ -5615,5 +5824,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index e7d7f87e4fc4c..a934f0e9c6770 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1792,9 +1792,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 @@ -1829,9 +1829,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 @@ -1866,9 +1866,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr 
a ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -1968,9 +1968,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 @@ -1990,9 +1990,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2012,9 +2012,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], 
v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 @@ -2181,9 +2181,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 @@ -2219,9 +2219,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 @@ -2256,9 +2256,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2358,9 +2358,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2380,9 +2380,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2402,9 +2402,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 @@ -3328,11 +3328,11 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace( ; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4335,11 +4335,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; NOLIT-SRCC: ; 
%bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4938,11 +4938,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4960,11 +4960,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_nop 0 @@ -5575,8 +5575,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] ; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v34, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] @@ -5619,11 +5619,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 ; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] @@ -5642,11 +5642,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 ; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 -; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] @@ -5844,40 +5844,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(7) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(6) +; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a24, v24 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(5) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(4) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(3) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(2) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(1) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0) ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 ; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a30, v30 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 @@ -5939,40 +5946,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; LIT-SRCC-NEXT: s_waitcnt vmcnt(7) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(6) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(5) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(4) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(3) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(2) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(1) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: s_waitcnt vmcnt(0) ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 ; 
LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index aae14c8cc87b3..779bbd0e4d1bf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + ; 0 = fp8 ; 1 = bf8 ; 2 = fp6 @@ -1871,36 +1872,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: 
v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v21, s12 -; SDAG-NEXT: v_mov_b32_e32 v22, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_mov_b32_e32 v1, s12 +; SDAG-NEXT: v_mov_b32_e32 v2, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v1, v2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; 
GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1913,18 +1914,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b32_e32 v20, s28 -; GISEL-NEXT: v_mov_b32_e32 v21, s29 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1937,32 +1940,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, -2 -; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; 
SDAG-NEXT: v_mov_b32_e32 v1, -2 +; SDAG-NEXT: v_mov_b32_e32 v2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: @@ -1970,24 +1975,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v21, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2000,32 +2007,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; 
SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: @@ -2033,24 +2042,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v21, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2063,32 +2074,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, -2 -; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, -2 +; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2096,24 +2109,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v21, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2126,32 +2141,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, 0.15915494 -; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0.15915494 +; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 
+; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2159,24 +2176,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v21, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, 0.15915494 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; 
GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2549,5 +2568,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..c342cd140100b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + ; 0 = fp8 ; 1 = bf8 ; 2 = fp6 @@ -4600,41 +4601,49 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4643,33 +4652,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: 
v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_mov_b32_e32 v32, s0 -; GISEL-NEXT: v_mov_b32_e32 v33, s1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], 
v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4681,78 +4698,94 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v32, -2 -; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, -2 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, 
s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v33, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 
v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4781,9 +4814,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_mov_b32_e32 v14, s24 ; SDAG-NEXT: v_mov_b32_e32 v15, s25 ; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4854,10 +4887,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4880,12 +4913,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; 
GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -5005,10 +5038,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -5029,12 +5062,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -5065,71 +5098,77 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; 
SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v44, s24 -; SDAG-NEXT: v_mov_b32_e32 v45, s25 -; SDAG-NEXT: v_mov_b32_e32 v46, s26 -; SDAG-NEXT: v_mov_b32_e32 v47, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, 
s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: 
s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,45 +5176,61 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] -; 
GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 +; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a22, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; 
GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5188,73 +5243,80 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: ; 
SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v32, 42 -; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: v_mov_b32_e32 v1, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: 
v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: 
v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5262,54 +5324,62 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 25 -; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b32_e32 
v20, 25 +; GISEL-NEXT: v_mov_b32_e32 v21, 42 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; 
GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt 
vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -6370,6 +6440,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll index dbe95a8091932..e36b2181bf5c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll @@ -124,8 +124,8 @@ define amdgpu_ps void @test_call(ptr addrspace(1) inreg %ptr) { ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-SDAG-NEXT: s_mov_b32 s6, src_pops_exiting_wave_id -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], 36 +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 @@ -155,31 +155,51 @@ define amdgpu_ps void 
@test_call(ptr addrspace(1) inreg %ptr) { ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-LABEL: test_call: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s38, -1 -; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s2 -; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[8:9], 36 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id -; GFX10-NEXT: s_mov_b32 s32, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: test_call: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX10-SDAG-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-SDAG-NEXT: s_add_u32 s36, s36, s2 +; GFX10-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX10-SDAG-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX10-SDAG-NEXT: s_mov_b64 s[8:9], 36 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SDAG-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id +; GFX10-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: test_call: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_mov_b32 s36, 
SCRATCH_RSRC_DWORD0 +; GFX10-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-GISEL-NEXT: s_add_u32 s36, s36, s2 +; GFX10-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX10-GISEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX10-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-GISEL-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id +; GFX10-GISEL-NEXT: s_mov_b64 s[8:9], 36 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.pops.exiting.wave.id() call void @foo(i32 %id) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll index b9bf76c1423b6..22db8d504f416 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll @@ -244,9 +244,9 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp ; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1 ; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d1ba892d7f7e1..95ebb856b7aee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -446,9 +446,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -456,14 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; 
CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -480,9 +480,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -493,12 +493,12 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -517,9 +517,9 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; 
CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -533,9 +533,9 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -571,9 +571,9 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -593,10 +593,10 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -605,15 +605,15 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr 
addrspace(1 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -633,10 +633,10 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -645,15 +645,15 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 
v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 0795f4050b622..d64a0bedc57b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s + declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0 declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0 declare double @llvm.amdgcn.readlane.f64(double, i32) #0 @@ -214,9 +215,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -228,9 +229,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -247,9 
+248,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -261,9 +262,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -312,8 +313,8 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -366,12 +367,12 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 ; CHECK-GISEL-NEXT: v_readlane_b32 
s3, v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -426,12 +427,12 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -458,9 +459,9 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -474,9 +475,9 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -490,15 +491,15 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: 
test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -507,16 +508,16 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -536,12 +537,12 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -549,18 +550,18 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -580,12 +581,12 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: 
s_endpgm @@ -593,18 +594,18 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -640,9 +641,9 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -657,15 +658,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; 
CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm @@ -674,15 +675,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -697,15 +698,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm @@ -714,15 +715,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index e58bf6280a1f2..0df585ea2cc58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -234,9 +234,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -1022,8 +1022,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX8GISEL-NEXT: s_add_u32 s5, s2, s3 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -1167,8 +1167,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4 ; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s5, s3 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1199,8 +1200,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s4 ; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s5, s3 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12DAGISEL-NEXT: s_endpgm entry: @@ -1559,8 +1561,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll index f39dd867f9580..bbea318026ce6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr 
addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1204,8 +1204,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll index 5d408dc65d68b..0e492c3b3f1d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll @@ -41,8 +41,8 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll index 29dfb0b504f81..9213b0b59fa06 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll @@ -41,8 +41,8 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 ; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index 6f299ab8bb9cf..5488b123f0f12 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, 
s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1262,8 +1262,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index 3c4cbc74aedc1..65512fd382b09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: 
flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1262,8 +1262,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll index d6ccf7ce2831d..40fa80ff823b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: 
; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1205,8 +1205,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index f094213731684..84194714b95c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -38,8 +38,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -249,9 +249,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -1058,8 +1058,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX8GISEL-NEXT: s_add_u32 s5, s2, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -1078,8 +1078,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s6 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm @@ -1242,8 +1242,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s6, s3 ; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s3, s5 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1284,8 +1285,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s6, s3 ; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s5 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12DAGISEL-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12DAGISEL-NEXT: s_endpgm entry: @@ -1661,8 +1663,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 54c8e2e248f57..17c3ca584b5d1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -159,9 +159,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -819,9 +819,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -831,8 +831,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -900,9 +900,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1391,8 +1391,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB7_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 502ef84449751..ec2fb68273270 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -159,9 +159,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -819,9 +819,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -831,8 +831,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -900,9 +900,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1251,8 +1251,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll index d5f1750c268ab..9ed34b95908a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -37,8 +38,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -200,19 +201,153 @@ entry: ret void } +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; 
GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1) + store i32 
%result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -226,17 +361,17 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -247,13 +382,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; 
GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -265,13 +400,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -285,13 +420,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: 
s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -303,13 +438,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -323,13 +458,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -341,13 +476,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner 
Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -362,14 +497,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -382,14 +517,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_xor_b32 
s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -403,14 +538,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -423,14 +558,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; 
GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -450,7 +585,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -459,24 +594,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -491,7 +626,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; 
implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -500,20 +635,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -529,7 +664,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -538,24 +673,24 @@ define 
amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -569,7 +704,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -578,20 +713,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 
s6, s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -606,7 +741,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -615,24 +750,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz 
.LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -646,7 +781,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -655,20 +790,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop 
Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -683,7 +818,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -692,24 +827,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1032DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 
s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -723,7 +858,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -732,20 +867,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif 
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -762,7 +897,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -772,25 +907,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 
s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -806,7 +941,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -816,21 +951,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 
s6 @@ -847,7 +982,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -857,25 +992,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -891,7 +1026,7 @@ 
define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -901,21 +1036,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -940,905 +1075,3 @@ endif: store i32 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { -; GFX8DAGISEL-LABEL: uniform_value_i64: -; GFX8DAGISEL: ; %bb.0: ; %entry -; 
GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: s_mul_i32 s1, s3, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: s_mul_i32 s0, s2, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] -; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: uniform_value_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: uniform_value_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: uniform_value_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; 
GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: uniform_value_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_and_b32 
s4, s4, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1132DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1) - store i64 %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { -; GFX8DAGISEL-LABEL: divergent_value_i64: -; GFX8DAGISEL: ; 
%bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX8GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX8GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; 
GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX9GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1032DAGISEL-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6 -; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v3, s6 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s6 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6 -; GFX1164GISEL-NEXT: v_readlane_b32 s5, v3, s6 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s6 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v3, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; 
GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132GISEL-NEXT: v_readlane_b32 s5, v3, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %id.x, i32 1) - store i64 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_i64: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX8DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec 
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, 1 -; GFX8DAGISEL-NEXT: s_mul_i32 s4, s4, s6 -; GFX8DAGISEL-NEXT: s_mul_i32 s5, s5, s6 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: divergent_cfg_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX8GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s6, s4, s7 -; GFX8GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8GISEL-NEXT: s_endpgm 
-; -; GFX9DAGISEL-LABEL: divergent_cfg_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX9DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s5 -; GFX9DAGISEL-NEXT: s_mul_i32 s5, s7, s5 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: divergent_cfg_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 
s[6:7], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX9GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4 -; GFX9GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: divergent_cfg_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_clause 0x1 -; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; 
GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s5 -; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s7, s5 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: divergent_cfg_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1064GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 -; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX1064GISEL-NEXT: 
.LBB5_4: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: divergent_cfg_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_clause 0x1 -; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032DAGISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s6, s3 -; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s7, s3 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; 
GFX1032GISEL-LABEL: divergent_cfg_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1032GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1032GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032GISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3 -; GFX1032GISEL-NEXT: s_mul_i32 s7, s7, s3 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: divergent_cfg_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: 
v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1164DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, 1 -; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s4, s6 -; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s5, s6 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: divergent_cfg_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, 
v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1164GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s7 -; GFX1164GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: divergent_cfg_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s8, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; 
GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1132DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s4, s3 -; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s5, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: divergent_cfg_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s8, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: 
s_cbranch_execz .LBB5_2 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1132GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132GISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3 -; GFX1132GISEL-NEXT: s_mul_i32 s7, s5, s3 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in2, i32 1) - br label %endif - -else: - %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1) - br label %endif - -endif: - %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else] - store i64 %combine, ptr addrspace(1) %out - ret void -} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10DAGISEL: {{.*}} -; GFX10GISEL: {{.*}} -; GFX11DAGISEL: {{.*}} -; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 20523476a29d5..07736f01b4166 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -12,43 +12,45 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; GCN-NEXT: ds_load_b128 v[0:3], v32 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064 -; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 -; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 -; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 -; GCN-NEXT: ds_load_b128 v[0:3], v32 ; GCN-NEXT: ds_load_b128 v[8:11], v32 offset:2048 +; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 ; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 ; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 ; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mov_b32_e32 v47, v7 -; GCN-NEXT: s_waitcnt lgkmcnt(3) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: s_waitcnt lgkmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v63, v23 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v71, v31 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_mov_b32_e32 v43, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_dual_mov_b32 v47, v7 :: 
v_dual_mov_b32 v46, v6 ; GCN-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 -; GCN-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 -; GCN-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 +; GCN-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15 ; GCN-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 +; GCN-NEXT: s_waitcnt lgkmcnt(6) ; GCN-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 ; GCN-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23 ; GCN-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 +; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 ; GCN-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 -; GCN-NEXT: v_mov_b32_e32 v56, v16 +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31 ; GCN-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 +; GCN-NEXT: s_waitcnt lgkmcnt(2) ; GCN-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 ; GCN-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 -; GCN-NEXT: v_mov_b32_e32 v64, v24 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39 ; GCN-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 ; GCN-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 ; GCN-NEXT: v_mov_b32_e32 v72, v32 @@ -80,43 +82,45 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; EXACTCUTOFF-NEXT: ds_load_b128 
v[0:3], v32 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064 -; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 -; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 -; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v32 offset:2048 +; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 ; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 ; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 ; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v47, v7 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v55, v15 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v63, v23 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v71, v31 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(9) +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v3 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v7 :: v_dual_mov_b32 v46, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(7) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 +; EXACTCUTOFF-NEXT: s_waitcnt 
lgkmcnt(6) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, v8 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(5) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v56, v16 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v64, v24 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v72, v32 @@ -184,14 +188,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ds_load_b128 v[0:3], v17 +; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: 
v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GCN-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -202,9 +208,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: v_mov_b32_e32 v16, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -216,9 +223,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -230,9 +238,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288 ; GCN-NEXT: ; 
sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -244,9 +253,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -266,14 +276,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v11, v3 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, 
v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -284,9 +296,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -298,9 +311,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -312,9 +326,10 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -326,9 +341,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index dcc3e0df0c744..703661e22b495 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -24,24 +24,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264 ; GCN-NEXT: ; 
sched_group_barrier mask(0x00000100) size(7) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x6 -; GCN-NEXT: v_mov_b32_e32 v31, v11 +; GCN-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10 +; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 ; GCN-NEXT: s_wait_dscnt 0x5 -; GCN-NEXT: v_mov_b32_e32 v35, v15 +; GCN-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14 +; GCN-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12 ; GCN-NEXT: s_wait_dscnt 0x4 -; GCN-NEXT: v_mov_b32_e32 v39, v19 +; GCN-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18 +; GCN-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16 ; GCN-NEXT: s_wait_dscnt 0x3 -; GCN-NEXT: v_mov_b32_e32 v43, v23 +; GCN-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22 +; GCN-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20 ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 -; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 -; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 -; GCN-NEXT: v_mov_b32_e32 v32, v12 -; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 -; GCN-NEXT: v_mov_b32_e32 v36, v16 -; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 -; GCN-NEXT: v_mov_b32_e32 v40, v20 -; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 -; GCN-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26 +; GCN-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 @@ -76,24 +72,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11 +; 
EXACTCUTOFF-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index 4905c6d8aa81b..78ff7e7510b66 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -91,15 +91,25 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_tma: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_tma: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_tma: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_tma: ; GFX1250: ; %bb.0: @@ -117,15 +127,25 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_realtime: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_realtime: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_realtime: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_realtime: ; GFX1250: ; %bb.0: @@ -186,15 +206,25 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_tba: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_tba: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_tba: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_tba: ; GFX1250: ; %bb.0: @@ -255,15 +285,25 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_99999_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_99999_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_99999_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_99999_i64: ; GFX1250: ; %bb.0: @@ -281,15 +321,25 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) { -; GFX11-LABEL: 
test_get_136_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_136_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_136_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_136_i64: ; GFX1250: ; %bb.0: @@ -308,3 +358,5 @@ define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) { declare i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32) declare i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 25996ee11c5a1..1c3c6e3bc3489 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s + declare i32 @llvm.amdgcn.workitem.id.x() ; -------------------------------------------------------------------- @@ -44,20 +45,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b32_e32 v16, s16 +; GISEL-NEXT: v_mov_b32_e32 v12, s16 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -474,7 +475,6 @@ define 
<16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -482,6 +482,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -797,26 +798,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 
v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -828,20 +829,20 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -963,22 +964,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; 
SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -1264,7 +1265,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -1272,6 +1272,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: 
v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -1308,26 +1309,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1339,20 +1340,20 @@ define amdgpu_kernel void 
@test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1470,26 +1471,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, 
s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1501,20 +1502,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], 
v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1632,26 +1633,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: 
s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1663,20 +1664,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1794,26 +1795,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: 
v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -1825,20 +1826,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: 
v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1960,22 +1961,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 
@@ -2261,7 +2262,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2269,6 +2269,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -2309,22 +2310,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, 
s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -2610,7 +2611,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2618,6 +2618,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -2658,22 +2659,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; 
SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -2959,7 +2960,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2967,6 +2967,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -3007,22 +3008,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; 
SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -3308,7 +3309,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -3316,6 +3316,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -3336,4 +3337,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { 
"amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index 037e26087eaa5..a482af1e41afe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -46,14 +46,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -89,15 +89,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ 
-406,14 +406,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -449,15 +449,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -516,14 +516,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; 
GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -559,15 +559,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -626,14 +626,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -669,15 +669,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 
x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -736,14 +736,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -779,15 +779,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] 
+; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -841,9 +841,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -870,11 +870,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -923,9 +923,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -952,11 +952,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1005,9 +1005,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1034,11 +1034,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; 
GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1087,9 +1087,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1116,11 +1116,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1174,14 +1174,14 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1 ; GISEL-NEXT: s_mov_b32 s2, 2 -; GISEL-NEXT: s_mov_b32 
s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1217,15 +1217,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, < ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_movk_i32 s0, 0x80 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1284,14 +1284,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1327,15 +1327,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1389,9 +1389,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1418,11 +1418,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; 
GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1476,14 +1476,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, < ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1519,15 +1519,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1586,14 +1586,14 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; 
GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1630,15 +1630,15 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v43, 0x65 @@ -1698,14 +1698,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i3 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 
v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1743,15 +1743,15 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] @@ -1806,9 +1806,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1835,11 +1835,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; 
GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1888,9 +1888,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1917,11 +1917,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1970,9 +1970,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1999,11 +1999,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2052,9 +2052,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2081,11 +2081,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2139,14 +2139,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2182,15 +2182,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; 
GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2249,14 +2249,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2292,15 +2292,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2359,14 +2359,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: 
s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2402,15 +2402,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2469,14 +2469,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2512,15 +2512,15 @@ define 
amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2590,8 +2590,6 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2604,8 +2602,10 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2654,8 +2654,6 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: 
s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2669,8 +2667,10 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2746,8 +2746,6 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2760,8 +2758,10 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2811,8 +2811,6 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, 
s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2826,8 +2824,10 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32 ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2904,8 +2904,6 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2918,8 +2916,10 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> % ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2970,8 +2970,6 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2985,8 +2983,10 @@ define amdgpu_ps void 
@test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 2752649550b69..5f08f5970a0e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -534,8 +534,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -590,8 +590,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -656,14 +656,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1] ; GFX802-SDAG-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: s_nop 3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -719,14 +719,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1] ; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: s_nop 2 +; GFX802-GISEL-NEXT: s_nop 3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -791,13 +791,13 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: s_nop 3 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s2 ; 
GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 @@ -937,14 +937,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: s_nop 2 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 @@ -1087,9 +1087,9 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 @@ -1496,8 +1496,8 @@ define 
amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm @@ -1536,8 +1536,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm @@ -1631,10 +1631,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -1734,10 +1734,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: 
v_mov_b32_e32 v3, s3 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -1786,8 +1786,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm @@ -1821,8 +1821,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm @@ -1907,11 +1907,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -2004,11 +2004,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, 
s1 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3897a0e028334..ccf85a0d8e45e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -382,8 +382,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -434,8 +434,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -916,8 +916,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: 
flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -988,8 +988,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm @@ -1674,8 +1674,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -1765,8 +1765,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 574b1c0b4974c..7c5d38c73dd2c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -384,8 +384,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; 
VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -436,8 +436,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -918,8 +918,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -990,8 +990,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm @@ -1676,8 +1676,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -1767,8 +1767,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index dd44a1a35067e..27aeae985d1e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -61,9 +61,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_ldexp_f32 v2, v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -240,8 +240,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -465,18 +465,18 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; 
VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 ; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -765,17 +765,17 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 956145fb24c4a..ee53332f2f786 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX8CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX8CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX8CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 ; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2 ; GFX8CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index dd19ba17bb292..edc200d757737 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -51,8 +51,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 -; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX8SELDAG-NEXT: s_endpgm @@ -68,8 +68,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX8GLISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GLISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0 -; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GLISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index 0a9fe10874c38..3d42c0bdf5dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -49,8 +49,8 @@ 
define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 -; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX8SELDAG-NEXT: s_endpgm @@ -66,8 +66,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX8GLISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GLISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0 -; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GLISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index e24fd1f22bfa6..6e8e3a9baef0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -285,12 +285,12 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -580,14 +580,14 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v5, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -967,16 +967,16 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v8, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v7, v6 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v8, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 76b97e843d777..37250ca94f42a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s8, s5 ; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 +; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], 
-v[0:1] @@ -150,8 +150,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s12, s10, s0 ; SI-NEXT: s_cselect_b32 s13, s11, s1 +; SI-NEXT: s_cselect_b32 s12, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -170,8 +170,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 ; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] @@ -243,8 +243,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s16, s10, s0 ; SI-NEXT: s_cselect_b32 s17, s11, s1 +; SI-NEXT: s_cselect_b32 s16, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -264,9 +264,9 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s5, s10, s5 ; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 -; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] @@ -285,8 +285,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s10, s9 ; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: 
s_cselect_b32 s9, s15, s9 +; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] @@ -397,8 +397,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: s_cselect_b32 s24, s10, s0 ; SI-NEXT: s_cselect_b32 s25, s11, s1 +; SI-NEXT: s_cselect_b32 s24, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s24 ; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -418,9 +418,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 -; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] @@ -439,8 +439,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s11, s9 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 +; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] @@ -459,8 +459,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: s_cselect_b32 s5, s13, s5 +; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] @@ -480,8 +480,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, 
<8 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s11, s9 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s8, s18, s8 ; SI-NEXT: s_cselect_b32 s9, s19, s9 +; SI-NEXT: s_cselect_b32 s8, s18, s8 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 001d7487b51b4..cb1d68936a1cf 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -53,9 +53,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %ld = load double, ptr addrspace(4) %in diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 81e407de9c324..082d0d5957bf3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -616,9 +616,9 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -1551,8 +1551,8 @@ define amdgpu_kernel void 
@constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10003 @@ -1562,13 +1562,13 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10002 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10004 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 7, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 6, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 @@ -1705,8 +1705,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v3, v4, 3, 1 @@ -1858,8 +1858,8 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s2 @@ -1883,28 +1883,28 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 
s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_mov_b32_e32 v7, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v11, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s16 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: v_mov_b32_e32 v10, s15 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_mov_b32_e32 v13, s4 @@ -2099,16 +2099,16 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: flat_load_ushort v18, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2417,56 +2417,56 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s28 ; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -2856,56 +2856,56 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v1, s34 ; GFX8-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v1, s22 ; GFX8-NEXT: v_mov_b32_e32 v2, s21 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -3437,93 +3437,93 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 -; 
GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NEXT: v_mov_b32_e32 v3, s38 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s56 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s53 ; GFX8-NEXT: 
v_mov_b32_e32 v3, s29 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s51 ; GFX8-NEXT: v_mov_b32_e32 v1, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s25 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s49 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s47 ; GFX8-NEXT: v_mov_b32_e32 v1, s46 @@ -3535,30 +3535,30 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: s_add_u32 s22, s0, 64 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: s_add_u32 s18, s0, 48 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: s_add_u32 s14, s0, 32 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -3566,9 +3566,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s4, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -4274,84 +4274,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s61 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s59 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s55 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s53 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s51 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s43 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 
0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s33 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 @@ -4364,9 +4364,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s22, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4374,9 +4374,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s18, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4384,9 +4384,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s14, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; 
GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4394,9 +4394,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4404,9 +4404,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -5072,8 +5072,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -5238,8 +5239,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -5500,9 +5502,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 @@ -5624,8 +5626,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 @@ -5756,14 +5758,14 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 @@ -5903,8 +5905,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: 
v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 @@ -6057,11 +6059,11 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v15, s4 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 @@ -6072,16 +6074,16 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 ; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v18, s1 ; GFX8-NEXT: v_mov_b32_e32 v17, s0 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 ; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 +; GFX8-NEXT: v_mov_b32_e32 v16, s3 ; GFX8-NEXT: v_mov_b32_e32 v15, s2 ; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 ; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 @@ -6253,8 +6255,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: 
s_lshr_b32 s2, s3, 6 @@ -6276,28 +6278,28 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 ; GFX8-NEXT: v_mov_b32_e32 v9, s7 ; GFX8-NEXT: v_mov_b32_e32 v10, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s9 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v12, s10 ; GFX8-NEXT: v_mov_b32_e32 v13, s11 ; GFX8-NEXT: v_mov_b32_e32 v14, s12 ; GFX8-NEXT: v_mov_b32_e32 v15, s13 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 @@ -6542,43 +6544,43 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; 
GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: v_mov_b32_e32 v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 @@ -6862,8 +6864,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v19, s1 ; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_lshr_b32 s2, s3, 14 @@ -6901,64 +6903,64 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: v_mov_b32_e32 v9, s21 -; GFX8-NEXT: s_addc_u32 
s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v10, s22 ; GFX8-NEXT: v_mov_b32_e32 v11, s23 ; GFX8-NEXT: v_mov_b32_e32 v12, s24 ; GFX8-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v14, s26 ; GFX8-NEXT: v_mov_b32_e32 v15, s27 ; GFX8-NEXT: v_mov_b32_e32 v16, s28 ; GFX8-NEXT: v_mov_b32_e32 v17, s29 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 @@ -7365,94 +7367,94 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: 
s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 @@ -8071,48 +8073,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: s_addc_u32 s45, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, 
s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 @@ -8125,9 +8127,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s40, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v3, 
s41 ; GFX8-NEXT: s_addc_u32 s41, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v0, s42 ; GFX8-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8135,9 +8137,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s36, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: s_addc_u32 s37, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s38 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8145,9 +8147,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s30, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: s_addc_u32 s31, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8155,9 +8157,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8165,9 +8167,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s22, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v3, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NEXT: v_mov_b32_e32 v4, 
s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8175,9 +8177,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s18, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8185,9 +8187,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s14, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8195,9 +8197,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8205,9 +8207,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -8956,112 +8958,110 @@ define amdgpu_kernel void 
@constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s43 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0 ; GFX8-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NEXT: v_mov_b32_e32 v2, s44 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0 ; GFX8-NEXT: v_mov_b32_e32 v0, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s45 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0 ; GFX8-NEXT: v_mov_b32_e32 v0, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s47 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x190 ; GFX8-NEXT: v_mov_b32_e32 v0, s62 ; GFX8-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 
s42, s0, 0x190 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x180 ; GFX8-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s49 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x180 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x170 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x170 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x160 ; GFX8-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NEXT: v_mov_b32_e32 v2, s51 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x150 ; GFX8-NEXT: v_mov_b32_e32 v0, s58 ; GFX8-NEXT: v_mov_b32_e32 v2, s52 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x140 ; GFX8-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s53 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x130 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 
; GFX8-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x130 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x120 ; GFX8-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x120 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 ; GFX8-NEXT: v_mov_b32_e32 v0, s54 ; GFX8-NEXT: v_mov_b32_e32 v2, s37 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s40, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v0, s41 ; GFX8-NEXT: s_addc_u32 s41, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v2, s35 ; GFX8-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NEXT: s_add_u32 s38, s0, 0x100 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NEXT: v_mov_b32_e32 v0, s39 ; GFX8-NEXT: s_addc_u32 s39, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NEXT: v_mov_b32_e32 v2, s33 ; GFX8-NEXT: v_mov_b32_e32 v5, s39 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -9069,52 +9069,48 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: s_add_u32 s36, s0, 0xf0 ; GFX8-NEXT: s_addc_u32 s37, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: s_add_u32 s34, s0, 0xe0 ; GFX8-NEXT: s_addc_u32 s35, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: s_add_u32 s30, s0, 0xd0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: s_addc_u32 s31, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NEXT: s_add_u32 s28, s0, 0xc0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NEXT: s_addc_u32 s29, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NEXT: s_add_u32 s22, s0, 0xa0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: s_add_u32 s22, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 -; GFX8-NEXT: s_add_u32 s22, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 @@ -9124,32 +9120,32 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: s_add_u32 s20, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s21, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; 
GFX8-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: s_add_u32 s18, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: s_add_u32 s16, s0, 0x60 ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -10400,70 +10396,70 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1f0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 +; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 +; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1b0 +; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; 
GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0 +; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x190 +; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: v_mov_b32_e32 v54, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x180 +; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: v_mov_b32_e32 v56, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x170 +; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x160 +; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 +; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: 
v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 ; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] @@ -10482,159 +10478,159 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: 
v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_readlane_b32 s2, v62, 4 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_readlane_b32 s3, v62, 5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s31 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 0194e3c6ce37b..61fb7b9f0a3b1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s + define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry @@ -202,10 +203,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: flat_store_short v[2:3], v4 @@ -220,10 +221,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2 ; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4 @@ -348,9 +349,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -429,10 +430,10 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -480,12 +481,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -504,12 +505,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -613,8 +614,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -633,36 +634,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; 
GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5] @@ -671,38 +673,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[2:3] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[4:5] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[6:7] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[2:3] @@ -1802,9 +1803,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 
s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1834,9 +1835,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1957,16 +1958,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -1989,16 +1990,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2151,20 +2152,20 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2209,20 +2210,20 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -2408,27 +2409,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, 
s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2436,8 +2439,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2466,27 +2467,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 -; 
GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 @@ -2494,8 +2497,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2741,58 +2742,58 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -2853,56 +2854,56 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3211,73 +3212,73 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -3323,71 +3324,71 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 
0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3830,25 +3831,25 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 @@ -3856,20 +3857,19 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 @@ 
-3880,14 +3880,14 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 @@ -3903,8 +3903,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 @@ -3920,40 +3921,40 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -4047,128 +4048,128 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; 
GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -4735,56 +4736,56 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 ; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 
v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 @@ -4805,11 +4806,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 @@ -4821,9 +4821,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 @@ -4839,40 +4840,40 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -4936,157 +4937,159 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; 
GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: 
s_sext_i32_i16 s31, s31 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s29 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s28 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s27 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s24 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s22 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s20 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 @@ -5094,8 +5097,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6078,8 +6079,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6219,9 +6220,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6250,9 +6251,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -6382,16 +6383,16 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 
s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6426,16 +6427,16 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -6609,20 +6610,20 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6666,21 +6667,21 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 @@ -6879,44 +6880,44 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6959,44 +6960,44 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -7285,23 +7286,23 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -7310,30 +7311,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 @@ -7389,30 +7390,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 @@ -7425,20 +7426,20 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -7768,96 +7769,96 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; 
GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 
v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 @@ -7867,8 +7868,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -7923,87 +7924,87 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 
0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 -; 
GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 @@ -8013,10 +8014,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -8533,18 +8534,18 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 @@ -8562,11 +8563,11 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 @@ -8577,38 +8578,38 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] @@ -8618,9 +8619,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8628,9 +8629,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8638,9 +8639,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 
s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8648,9 +8649,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8658,9 +8659,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -8738,9 +8739,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8748,9 +8749,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8758,9 +8759,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8768,9 +8769,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8778,9 +8779,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8789,12 +8790,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: 
s_add_u32 s38, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 @@ -8808,12 +8809,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 @@ -8827,12 +8828,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 @@ -8845,11 +8846,11 @@ 
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -8864,11 +8865,11 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 9d3a9f1dff8e8..0aabc9af5aa85 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -177,9 +177,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; 
GFX12-NEXT: s_endpgm ; @@ -394,10 +394,10 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -458,12 +458,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: s_endpgm @@ -482,12 +482,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: s_endpgm @@ -609,15 +609,15 @@ 
define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -640,15 +640,15 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -791,18 +791,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 ; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -823,18 +823,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -903,13 +903,13 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 
0x0 -; GFX12-NEXT: v_mov_b32_e32 v10, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v9, s13 ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6 -; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16 @@ -991,19 +991,19 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX7-HSA-NEXT: s_endpgm @@ -1026,17 +1026,17 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1184,27 +1184,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 ; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -1218,27 +1219,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1302,14 +1304,14 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v12, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v12, 0 
:: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 -; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14 -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX12-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX12-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 -; GFX12-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s1 +; GFX12-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s3 +; GFX12-NEXT: v_mov_b32_e32 v10, s2 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16 @@ -1390,27 +1392,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -1432,27 +1434,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 @@ -2257,8 +2259,8 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2415,16 +2417,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: s_endpgm @@ -2443,16 +2445,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: s_endpgm @@ -2595,20 +2597,20 @@ define amdgpu_kernel void 
@constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2838,9 +2840,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s6, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -2848,9 +2850,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -2858,9 +2860,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr 
addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s2, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -2890,9 +2892,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -2900,9 +2902,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -2910,9 +2912,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -3176,12 +3178,12 @@ define amdgpu_kernel void 
@constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 @@ -3189,19 +3191,19 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 @@ -3209,9 +3211,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 @@ -3219,23 +3221,23 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -3271,9 +3273,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3281,9 +3283,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3291,9 +3293,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3301,9 +3303,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3311,9 +3313,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3321,9 +3323,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3331,9 +3333,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 @@ -4187,55 +4189,55 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 ; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 ; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v26, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 ; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] @@ -4249,7 +4251,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 @@ -4257,16 +4258,17 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 @@ -4276,9 +4278,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX7-HSA-NEXT: s_nop 0 @@ -4286,9 +4288,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -4296,9 +4298,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -4353,9 +4355,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s30, s36, 0xf0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4363,9 +4365,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s28, s36, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NOHSA-NEXT: s_addc_u32 s29, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4373,9 +4375,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s26, s36, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX8-NOHSA-NEXT: s_addc_u32 s27, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4383,9 +4385,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s24, s36, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4393,9 +4395,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s22, s36, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NOHSA-NEXT: s_addc_u32 s23, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4403,9 +4405,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4413,9 +4415,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s18, s36, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: s_addc_u32 s19, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4423,9 +4425,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4433,9 +4435,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s14, s36, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4443,9 +4445,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4453,9 +4455,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s10, s36, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4463,9 +4465,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4473,9 +4475,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s6, s36, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4483,9 +4485,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4493,9 +4495,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s2, s36, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 @@ -5092,35 +5094,35 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GFX7-HSA-NEXT: 
s_add_u32 s0, s36, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5793,20 +5795,21 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -5815,9 +5818,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs 
; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -5825,9 +5828,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -5835,9 +5838,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -5869,18 +5872,18 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0x50 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5888,9 +5891,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5898,9 +5901,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5908,9 +5911,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5918,9 +5921,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; 
GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 542b0ccedbf14..bbf73c7e9c892 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -69,9 +69,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %ld = load i64, ptr addrspace(4) %in @@ -149,10 +149,10 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -198,12 +198,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, s8 ; 
GFX7-NEXT: v_mov_b32_e32 v6, s9 -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6] ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_endpgm @@ -221,12 +221,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, s8 ; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -317,12 +317,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: s_endpgm @@ -341,12 +341,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: 
v_mov_b32_e32 v5, s1 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm @@ -444,27 +444,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s16 @@ -486,27 +486,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; 
GFX8-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s4, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s16 @@ -680,20 +680,21 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 0x50 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 64 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -702,9 +703,9 @@ define 
amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s12, s16, 48 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 @@ -712,9 +713,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 @@ -722,9 +723,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s16 @@ -756,18 +757,18 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v6, s26 ; GFX8-NEXT: v_mov_b32_e32 v7, s27 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: s_add_u32 s20, s36, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -775,9 +776,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s16, s36, 64 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -785,9 +786,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s12, s36, 48 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -795,9 +796,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s8, s36, 32 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -805,9 +806,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s4, s36, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s36 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index a71a5bbf95645..62d99adc1405c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s + ; TODO: NOT AND define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: @@ -228,14 +229,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: flat_store_short v[0:1], v4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: flat_store_byte v[2:3], v0 @@ -247,14 +248,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v3, s1 ; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: flat_store_byte v[2:3], v0 @@ -454,9 +455,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -535,10 +536,10 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1554,9 +1555,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1586,9 +1587,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1716,9 +1717,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1748,9 +1749,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1909,20 +1910,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, 
s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -1967,20 +1968,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2174,24 +2175,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 @@ -2199,7 +2201,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2232,24 +2233,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 @@ -2257,7 +2259,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2509,58 +2510,58 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2621,56 +2622,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2986,66 +2987,67 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 @@ -3053,7 +3055,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -3098,64 +3099,65 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; 
GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, 
s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 @@ -3163,7 +3165,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 @@ -3611,25 +3612,25 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: 
s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 @@ -3637,31 +3638,31 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 ; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[19:20], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 @@ -3677,30 +3678,30 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; 
GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 @@ -3713,13 +3714,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 @@ -3731,12 +3732,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -3830,93 +3831,93 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: 
s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s67 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, 
s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 @@ -3929,13 +3930,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 @@ -3947,12 +3948,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4517,46 +4518,46 @@ define amdgpu_kernel void 
@constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: 
s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 @@ -4574,49 +4575,49 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 @@ -4628,15 +4629,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 @@ -4647,13 +4648,14 @@ define amdgpu_kernel void 
@constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 @@ -4661,7 +4663,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -4730,107 +4731,107 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i32 s66, s15, 0x80008 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s14 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s61 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 
0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 +; 
GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s35 @@ -4842,15 +4843,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 @@ -4861,13 +4862,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 @@ -4875,7 +4877,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -6067,9 +6068,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6098,9 +6099,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6231,17 +6232,17 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6275,17 +6276,17 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6463,11 +6464,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6477,13 +6478,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6520,11 +6521,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6534,13 +6535,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6741,37 +6742,37 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6779,11 +6780,11 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6821,44 +6822,44 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; 
GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -7159,22 +7160,23 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 @@ -7182,29 +7184,28 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 @@ -7262,21 +7263,21 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 @@ -7296,9 +7297,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -7306,9 +7307,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -7316,10 +7317,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7656,96 +7657,96 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 @@ -7811,94 +7812,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v2, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 @@ -8443,75 +8444,75 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 ; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 ; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 
v[24:25], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] @@ -8520,9 +8521,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8530,9 +8531,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8540,9 +8541,9 @@ define 
amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8550,9 +8551,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8560,9 +8561,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8570,9 +8571,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -8648,21 +8649,21 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 @@ -8676,12 +8677,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 @@ -8695,12 +8696,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 
s31 ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 @@ -8721,12 +8722,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 @@ -8740,12 +8741,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 @@ -8758,11 +8759,11 @@ 
define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -8777,11 +8778,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10594,10 +10595,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10650,10 +10651,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10968,9 +10969,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11041,9 +11042,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11452,21 +11453,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 
s0, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11558,21 +11559,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -12146,20 +12147,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -12287,20 +12288,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index c119ef274bb04..7f26738eb0aac 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -260,16 +260,16 @@ define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 @@ -336,27 +336,27 @@ define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dword v14, v[6:7] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -426,27 +426,27 @@ define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -517,27 +517,27 @@ define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: 
flat_load_dwordx3 v[8:10], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] @@ -607,27 +607,27 @@ define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] @@ -702,16 +702,16 @@ define amdgpu_kernel void 
@global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 @@ -720,19 +720,19 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 388006281abdc..1390377ad6499 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s + ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { @@ -235,8 +236,8 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_add_u32 s2, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: flat_store_short v[4:5], v1 @@ -543,18 +544,18 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -696,18 +697,18 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -1893,8 +1894,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -2045,8 +2046,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 @@ -2209,24 +2210,24 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 @@ -2446,25 +2447,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: 
s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2722,30 +2723,30 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 @@ -2760,23 +2761,22 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; 
GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 @@ -2786,6 +2786,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) @@ -3130,42 +3131,42 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 
v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 @@ -3177,13 +3178,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) @@ -3196,8 +3197,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) @@ -3209,16 +3208,18 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 @@ -3664,23 +3665,23 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3688,8 +3689,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15] @@ -3698,20 +3699,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 @@ -3733,21 +3734,21 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 @@ -3755,14 +3756,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) @@ -3770,9 +3771,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31 @@ -3780,20 +3782,18 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 @@ -3804,42 +3804,43 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 
0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4468,28 +4469,28 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: 
s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 @@ -4505,13 +4506,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v28 -; GCN-HSA-NEXT: v_bfe_i32 v34, v29, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v32, v28, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 +; GCN-HSA-NEXT: v_bfe_i32 v34, v29, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 @@ -4525,87 +4526,86 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[31:34] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 ; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 
s2, s0, 0xa0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 ; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-HSA-NEXT: 
s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 @@ -4614,9 +4614,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) @@ -4628,25 +4628,26 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16 ; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 ; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26 ; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -5861,14 +5862,14 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9 @@ -6013,17 +6014,18 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6173,24 +6175,24 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, 
s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -6395,17 +6397,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 @@ -6656,11 +6658,11 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 @@ -6672,9 +6674,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 @@ -6686,26 +6688,22 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8 @@ -6713,9 +6711,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 @@ -7039,33 +7041,33 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v7 @@ -7506,17 +7508,17 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] @@ -7534,12 +7536,12 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 ; GCN-HSA-NEXT: 
s_waitcnt vmcnt(3) @@ -7575,9 +7577,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 @@ -7587,67 +7589,67 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] @@ -8219,12 +8221,12 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -8233,9 +8235,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 @@ -8253,115 +8255,118 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, 
v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v11 ; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 ; GCN-HSA-NEXT: 
v_bfe_i32 v11, v9, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v9 -; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15] ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v12, 16, v3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v26, v27, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 @@ -8369,19 +8374,16 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 7203545ebf9a8..1706d1b7b97fa 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -6,6 +6,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s ; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s + define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry @@ -368,17 +369,17 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0 @@ -478,27 +479,27 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -612,27 +613,27 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -745,27 +746,27 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] @@ -883,27 +884,27 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] @@ -1020,17 +1021,17 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1038,17 +1039,17 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; 
GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] @@ -1732,8 +1733,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1861,8 +1862,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 @@ -2003,22 +2004,22 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCNX3-HSA-NEXT: 
s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2196,25 +2197,25 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 
v21, s3 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2431,42 +2432,42 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 @@ -2474,17 +2475,17 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) @@ -2497,8 +2498,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(5) @@ -2510,16 +2509,18 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -2802,12 +2803,12 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2819,52 +2820,52 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCNX3-HSA-NEXT: s_add_u32 
s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 -; 
GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -3203,30 +3204,30 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -3235,52 +3236,51 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: 
flat_store_dwordx4 v[36:37], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) @@ -3288,6 +3288,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22 @@ -3296,86 +3297,86 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, 
v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 ; GCNX3-HSA-NEXT: 
v_ashrrev_i32_e32 v24, 31, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] @@ -3663,7 +3664,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112 @@ -3672,6 +3672,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 @@ -3833,13 +3834,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v29 -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v30 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v31 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26 ; GCN-GFX908-HSA-NEXT: 
v_ashrrev_i32_e32 v42, 31, v25 @@ -3847,7 +3845,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v26 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v27 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 @@ -3856,10 +3854,13 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v1 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v2 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v3 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 +; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a2 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a1 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a0 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 @@ -3988,15 +3989,15 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v0, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60 ; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0 @@ -4018,8 +4019,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -4033,74 +4034,73 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 
v31, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 @@ -4109,35 +4109,35 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; 
GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6 @@ -4516,34 +4516,34 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; 
GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] @@ -4551,17 +4551,17 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] ; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0 @@ -4581,12 +4581,12 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 0c399d65d01cc..4974e74eecd36 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s + ; TODO: NOT AND define amdgpu_kernel void @global_load_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_load_i8: @@ -253,13 +254,13 @@ define amdgpu_kernel void @global_load_v3i8(ptr addrspace(1) %out, ptr addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dword v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GCN-HSA-NEXT: flat_store_short v[0:1], v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_byte v[0:1], v3 ; GCN-HSA-NEXT: s_endpgm @@ -1754,8 +1755,8 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v9 @@ -1912,8 +1913,8 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 24, v7 @@ -2085,14 +2086,14 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -2100,19 +2101,19 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 16, 8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GCN-HSA-NEXT: 
v_bfe_u32 v9, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] @@ -2324,14 +2325,14 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -2339,19 +2340,19 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] @@ -2606,17 +2607,17 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -2626,22 +2627,22 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v7 ; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GCN-HSA-NEXT: v_bfe_u32 v8, v6, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6 ; GCN-HSA-NEXT: v_bfe_u32 v9, v6, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 ; GCN-HSA-NEXT: v_bfe_u32 v7, v5, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5 ; GCN-HSA-NEXT: v_bfe_u32 v8, v5, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9] ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v4 @@ -2655,23 +2656,23 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GCN-HSA-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v5, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] @@ -3017,17 +3018,17 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 @@ -3037,15 +3038,14 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 @@ -3053,6 +3053,7 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 @@ -3066,23 +3067,23 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, 
v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] @@ -3571,117 +3572,117 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v17, v15, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v15 ; GCN-HSA-NEXT: v_bfe_u32 v18, v15, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[16:19] ; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v12 ; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v13 ; 
GCN-HSA-NEXT: v_bfe_u32 v15, v13, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v13 ; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v10 ; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v11 ; GCN-HSA-NEXT: v_bfe_u32 v13, v11, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v11 ; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 ; GCN-HSA-NEXT: v_bfe_u32 v11, v8, 8, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v8 ; GCN-HSA-NEXT: v_bfe_u32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9 ; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v9 ; GCN-HSA-NEXT: v_bfe_u32 v12, v9, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] ; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_bfe_u32 v18, v1, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v19, v1, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4 ; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v6 ; 
GCN-HSA-NEXT: v_bfe_u32 v18, v6, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v6 ; GCN-HSA-NEXT: v_bfe_u32 v19, v6, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v1, v7, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v7 ; GCN-HSA-NEXT: v_bfe_u32 v2, v7, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16] @@ -4332,9 +4333,9 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v14 @@ -4348,13 +4349,13 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v16, v15, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v12 ; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] @@ -4362,13 +4363,13 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v13, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 @@ -4380,10 +4381,10 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v14, v10, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v10, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v11 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 16, 8 @@ -4398,59 +4399,60 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v8, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 8 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9 ; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v6 ; GCN-HSA-NEXT: v_bfe_i32 v13, v6, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 0, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v4 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v5 ; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v5, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v3 ; GCN-HSA-NEXT: v_bfe_i32 v17, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 8, 8 @@ -4460,19 +4462,18 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[0:1], v[6:9] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] @@ -5751,14 +5752,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dword v0, v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_bfe_u32 v6, v0, 8, 8 @@ -5908,8 +5909,8 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 @@ -6077,30 +6078,30 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v8, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v16 ; GCN-HSA-NEXT: v_bfe_u32 v0, v16, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v3, v15, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v16, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v15, 8, 8 @@ -6330,7 +6331,6 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8 ; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8 ; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 @@ -6338,35 +6338,36 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm @@ -6611,11 +6612,11 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 
v14, v5 @@ -6627,51 +6628,51 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_bfe_u32 v11, v0, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GCN-HSA-NEXT: 
v_mov_b32_e32 v16, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v17, v1, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] @@ -7070,64 +7071,64 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], 
v[12:15] ; GCN-HSA-NEXT: s_endpgm @@ -7596,71 +7597,71 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v4 ; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v9 ; GCN-HSA-NEXT: v_bfe_u32 v10, v9, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, 
v8 ; GCN-HSA-NEXT: v_bfe_u32 v10, v8, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GCN-HSA-NEXT: v_bfe_u32 v10, v6, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v4 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 @@ -7669,48 +7670,48 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v5 ; GCN-HSA-NEXT: v_bfe_u32 v0, v5, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v6, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] @@ -8439,12 +8440,12 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 @@ -8453,9 +8454,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] @@ -8464,11 +8465,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] @@ -8483,29 +8484,30 @@ define amdgpu_kernel void 
@global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 ; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] @@ -8514,9 +8516,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8525,21 +8527,21 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 @@ -8552,10 +8554,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -10834,8 +10836,8 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 ; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1 @@ -10871,11 +10873,11 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6 @@ -11316,8 +11318,8 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 @@ -11365,11 +11367,11 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_or_b32 s2, s2, s17 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 @@ -12004,24 +12006,25 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm @@ -12815,8 +12818,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -12916,22 +12919,22 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 3fedd68edaea2..ed36a3d0036c8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s + ; Combine on select c, (load x), (load y) -> load (select c, x, y) ; drops MachinePointerInfo, so it can't be relied on for correctness. 
@@ -21,16 +22,16 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p ; GCN-NEXT: s_cselect_b32 s2, s4, s5 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_add_u32 s2, s2, 4 -; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: flat_load_dword v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: flat_load_dword v1, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -62,8 +63,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr ; GCN-NEXT: s_cselect_b32 s3, s3, s6 ; GCN-NEXT: s_cselect_b32 s2, s4, s5 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -93,8 +94,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3) ; GCN-NEXT: s_cselect_b32 s0, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -125,8 +126,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr ; GCN-NEXT: s_cselect_b32 s0, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: 
v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -140,81 +141,3 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr store i64 %tmp5, ptr addrspace(1) %ptr2, align 8 ret void } - -; The resultant load cannot be treated as uniform -define amdgpu_kernel void @sample_test(ptr addrspace(1) %dest, ptr addrspace(1) %sourceA, ptr addrspace(1) %sourceB, i1 %tobool.not.i) #0 { -; GCN-LABEL: sample_test: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x18 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_add_u32_e32 v3, vcc, s2, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm -entry: - %0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %conv2.i.i.i1 = zext i32 %0 to i64 - %arrayidx.i = getelementptr i64, ptr addrspace(1) %sourceA, i64 %conv2.i.i.i1 - %dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1 - %ld0 = load i64, ptr addrspace(1) %arrayidx.i, align 8, !amdgpu.noclobber !0 - %ld1 = load i64, ptr addrspace(1) %sourceB, align 8 - %cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1 - store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8 - ret void -} - -; The resultant load cannot be treated as uniform -define amdgpu_kernel void @constant_is_not_uniform(ptr addrspace(1) %dest, ptr 
addrspace(4) %sourceA, ptr addrspace(4) %sourceB, i1 %tobool.not.i) #0 { -; GCN-LABEL: constant_is_not_uniform: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x18 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_add_u32_e32 v3, vcc, s2, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm -entry: - %0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %conv2.i.i.i1 = zext i32 %0 to i64 - %arrayidx.i = getelementptr i64, ptr addrspace(4) %sourceA, i64 %conv2.i.i.i1 - %dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1 - %ld0 = load i64, ptr addrspace(4) %arrayidx.i, align 8 - %ld1 = load i64, ptr addrspace(4) %sourceB, align 8 - %cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1 - store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8 - ret void -} - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll 
b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 09225709a1acf..11bade44eb909 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -81,13 +81,13 @@ define amdgpu_kernel void @caller() { ; GFX9-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 ; GFX9-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 ; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 @@ -111,13 +111,13 @@ define amdgpu_kernel void @caller() { ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] ; GFX9-GISEL-NEXT: s_mov_b32 s12, s14 @@ -140,13 +140,13 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 ; GFX9ARCH-SDAG-NEXT: s_addc_u32 s5, s5, 
callee@gotpcrel32@hi+12 ; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 @@ -169,13 +169,13 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 ; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 2f3ca8b795f7d..044303ac3d67d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -104,11 +104,11 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; GFX1250-GISEL-NEXT: s_add_co_i32 s3, s3, 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 
s5, ttmp6, 0x40008 ; GFX1250-GISEL-NEXT: s_mul_i32 s3, s4, s3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, s3 ; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s4, s5 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1250-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index fc8467cb73ab6..228d7a397751d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1093,8 +1093,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 ; GFX11-NEXT: s_add_u32 s2, s6, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1111,8 +1112,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll 
b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index af713179a888d..607c3cbfce616 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -93,12 +93,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f @@ -176,12 +176,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index faf70f55876f7..de2135a3bfd74 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -165,9 +165,9 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, 
v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_i32 s6, s8, 1 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 5b7c36559a366..81dcb7aaad545 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -94,12 +94,11 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -107,6 +106,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) @@ -160,11 +160,11 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; 
CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -382,12 +382,11 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -395,6 +394,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) @@ -448,11 +448,11 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index d95965caa81ab..c936a73c07a56 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s + ; Check code generation for memmoves with statically unknown size and all ; combinations of the following address spaces: ; destination address space: 0, 1, 3, 5 @@ -32,10 +33,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -129,8 +130,8 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -180,10 +181,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execz .LBB1_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 
v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -277,8 +278,8 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -330,8 +331,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_cbranch_execz .LBB2_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v1 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -472,10 +473,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execz .LBB3_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -569,8 +570,8 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: 
v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -622,8 +623,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_cbranch_execz .LBB4_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v1 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -772,10 +773,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB5_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -869,8 +870,8 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -918,10 +919,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; 
CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -1015,8 +1016,8 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -1127,10 +1128,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -1224,8 +1225,8 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -1347,8 +1348,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB10_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 
+; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -1813,8 +1814,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB15_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index e6fd6aba92cf0..f52bcf52f9f3b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -388,14 +388,14 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 +; GCN-NEXT: s_brev_b32 s0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 9585c486aeb9e..0057c4da1196d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -308,40 +308,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: 
global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: 
v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -414,40 +421,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: 
v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -520,40 +534,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: 
v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -627,14 +648,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX908-NEXT: s_mov_b32 s32, 0 ; 
GFX908-NEXT: v_mov_b32_e32 v40, 0 @@ -648,40 +669,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; 
GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -897,10 +925,10 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -933,40 +961,47 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 
a0, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 -; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] @@ -1034,40 +1069,47 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: 
v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 -; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; 
GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 8b6bb9b8c5fcd..b3d6ac4c7e9ca 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -4,6 +4,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s + ; Check that we do not copy agprs to vgprs and back inside the loop. ; Final result should be read only once after the loop. @@ -55,12 +56,12 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -414,12 +415,12 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -767,12 +768,12 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 
13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1157,12 +1158,12 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1510,12 +1511,12 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1902,12 +1903,12 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: 
v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2295,12 +2296,12 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2658,12 +2659,12 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2863,12 +2864,12 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; 
GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -3272,12 +3273,12 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: ; %bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 10 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -3479,779 +3480,6 @@ exit: ret void } -; Phi exit use is vgpr abi use -define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { -; GFX908-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 
-; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: s_mov_b32 s4, 16 -; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX908-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] -; GFX908-NEXT: s_add_i32 s4, s4, -1 -; GFX908-NEXT: s_cmp_lg_u32 s4, 0 -; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX908-NEXT: ; %bb.2: ; %exit -; GFX908-NEXT: s_nop 14 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 
v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 -; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a3, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: s_add_i32 s4, s4, -1 -; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 
v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 -; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 
v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: s_add_i32 s0, s0, -1 -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] -; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; 
GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; 
GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_setpc_b64 s[30:31] -entry: - br label %for.cond.preheader - -for.cond.preheader: - %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] - %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) - %inc = add nuw nsw i32 %c, 1 - %cc = icmp eq i32 %inc, 16 - br i1 %cc, label %exit, label %for.cond.preheader - -exit: - ret <32 x float> %mai.1 -} - -define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { -; GFX908-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX908-NEXT: 
v_accvgpr_write_b32 a12, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: s_mov_b32 s4, 16 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_add_i32 s4, s4, -1 -; GFX908-NEXT: s_cmp_lg_u32 s4, 0 -; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX908-NEXT: ; %bb.2: ; %exit -; GFX908-NEXT: s_nop 14 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; 
GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 -; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 
v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: s_add_i32 s4, s4, -1 -; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 -; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 -; 
GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: s_add_i32 s0, s0, -1 -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] -; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: 
v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: 
v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_setpc_b64 s[30:31] -entry: - br label %for.cond.preheader - -for.cond.preheader: - %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] - %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) - %inc = add nuw nsw i32 %c, 1 - %cc = icmp eq i32 %inc, 16 - br i1 %cc, label %exit, label %for.cond.preheader - -exit: - ret <32 x float> %mai.1 -} - declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index c8cc40faf1e84..f9295e42f55c7 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -408,7 +408,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 @@ -424,6 +423,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 ; 
GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index eff0680fe9a31..0aa665594086a 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -357,11 +357,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; CI-NEXT: s_min_i32 s2, s2, s6 ; CI-NEXT: s_min_i32 s1, s1, s5 ; CI-NEXT: s_min_i32 s0, s0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -378,11 +378,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; VI-NEXT: s_min_i32 s2, s2, s6 ; VI-NEXT: s_min_i32 s1, s1, s5 ; VI-NEXT: s_min_i32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -524,8 +524,8 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x 
i32], ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -682,8 +682,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_or_b32 s3, s4, s3 ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -720,8 +720,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1098,9 +1098,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI-NEXT: s_and_b32 s0, s0, 0xffff ; CI-NEXT: s_or_b32 s1, s1, s7 ; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1131,9 +1131,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: 
s_endpgm @@ -1644,9 +1644,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1661,9 +1661,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -3430,9 +3430,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s9 ; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -3464,9 +3464,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3774,11 +3774,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; CI-NEXT: s_or_b32 s2, s2, s6 ; CI-NEXT: s_or_b32 s1, s1, s5 ; CI-NEXT: s_or_b32 s0, s0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v0, s0 
; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -3823,11 +3823,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_min_u32 s5, s6, s5 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_or_b32 s0, s5, s0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -3932,8 +3932,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3950,8 +3950,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -4062,8 +4062,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -4080,8 +4080,8 @@ 
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 05ff5c8bb0b3a..8adfed45e2514 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -91,15 +91,15 @@ define amdgpu_kernel void @withcall() { ; GFX9-NEXT: s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 @@ -156,15 +156,15 @@ define amdgpu_kernel void @withcall() { ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 ; G_GFX9-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; G_GFX9-NEXT: s_mov_b32 s14, s10 ; G_GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; G_GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; 
G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] ; G_GFX9-NEXT: v_mov_b32_e32 v3, 0 ; G_GFX9-NEXT: v_mov_b32_e32 v4, 8 ; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] ; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; G_GFX9-NEXT: s_mov_b64 s[4:5], s[12:13] ; G_GFX9-NEXT: s_mov_b32 s12, s16 diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index aff07787a2fb7..79414e52eed9c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -10,9 +10,9 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4 ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base ; CHECK-NEXT: s_movk_i32 s34, 0x80 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: v_dual_mov_b32 v27, s35 :: v_dual_mov_b32 v26, s34 ; CHECK-NEXT: s_add_nc_u64 s[44:45], s[34:35], 0x70 -; CHECK-NEXT: v_dual_mov_b32 v26, s34 :: v_dual_mov_b32 v27, s35 ; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 9afaab5ebcfb6..0e55d1df4e9c3 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -278,8 +278,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_add_u32 
s0, s4, 0x3039 @@ -300,9 +300,9 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_add_u32 s0, s4, 0x3039 @@ -361,12 +361,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_or_b32 s0, s2, 63 -; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_add_u32 s0, s8, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_addc_u32 s1, s9, 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -383,12 +383,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_or_b32 s0, s2, 63 -; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s0, s8, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index 89bcfb3b3a834..d8d47ad3cedfd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -41,8 +41,8 @@ define amdgpu_cs void @test_simple_indirect_call() { ; GFX10-NEXT: s_bitset0_b32 s11, 21 ; GFX10-NEXT: s_add_u32 s8, s8, s0 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] ; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index c98bcd53bec1a..ff3833245488b 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -292,10 +292,10 @@ define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 % ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] ; GCN-NEXT: s_bitset1_b32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_and_b32 s0, s2, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 1156f2718cf1e..4f1ac5f6f4683 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -357,8 +357,8 @@ define 
hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -385,8 +385,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 @@ -1031,8 +1031,8 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1348,8 +1348,8 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1573,8 +1573,8 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: 
s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1781,8 +1781,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2053,8 +2053,8 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2391,8 +2391,8 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2561,9 +2561,9 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s1, s[4:5], 0xec ; GFX8-NEXT: s_add_u32 s0, 0, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: s_addc_u32 s1, s1, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 2e15c37bc19fb..5a0ab4ef7b42b 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -89,8 +89,8 @@ define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) { ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[2:3], v5 ; GFX8-NEXT: flat_store_dword v[0:1], v4 @@ -140,7 +140,7 @@ define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) { ; GFX11-NEXT: s_add_u32 s0, s0, s2 ; GFX11-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX11-NEXT: s_endpgm ; @@ -246,8 +246,8 @@ define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[2:3], v5 ; GFX8-NEXT: flat_store_dword v[0:1], v4 @@ -303,7 +303,7 @@ define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) ; GFX11-NEXT: s_add_u32 s0, s0, 5 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 
v3, s1 ; GFX11-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..dd1a26eab49b1 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -50,7 +50,7 @@ body: | ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024_align2 = COPY_LANEMASK renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, lanemask(0x0000000300000000), implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) @@ -61,7 +61,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, 
$sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY_LANEMASK]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} @@ -136,7 +136,7 @@ body: | ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} @@ -185,7 +185,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable 
$sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) @@ -199,8 +199,8 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY1]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit 
$exec ; CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc @@ -221,7 +221,7 @@ body: | ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY_LANEMASK renamable $sgpr82_sgpr83, lanemask(0x0000000000000003) ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index b761f689d6af5..70992be391054 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -184,9 +184,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_mov_b32_e32 v18, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 @@ -1547,9 +1547,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; 
GFX9-NEXT: v_mov_b32_e32 v20, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 @@ -1624,8 +1624,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir index 7d11c2deb6658..628560cccf71c 100644 --- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: not llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -start-before=greedy,2 -filetype=null %s 2>&1 | FileCheck %s # This testcase fails register allocation at the same time it performs diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index fc154604b8700..ed38673a16283 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -96,22 +96,29 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace ; CHECK-NEXT: v_mov_b32_e32 v65, 2.0 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31] ; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29] +; 
CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27] ; CHECK-NEXT: v_mov_b64_e32 v[56:57], v[24:25] +; CHECK-NEXT: s_waitcnt vmcnt(5) ; CHECK-NEXT: v_mov_b64_e32 v[54:55], v[22:23] ; CHECK-NEXT: v_mov_b64_e32 v[52:53], v[20:21] +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: v_mov_b64_e32 v[50:51], v[18:19] ; CHECK-NEXT: v_mov_b64_e32 v[48:49], v[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: v_mov_b64_e32 v[46:47], v[14:15] ; CHECK-NEXT: v_mov_b64_e32 v[44:45], v[12:13] +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: v_mov_b64_e32 v[42:43], v[10:11] ; CHECK-NEXT: v_mov_b64_e32 v[40:41], v[8:9] +; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: v_mov_b64_e32 v[38:39], v[6:7] ; CHECK-NEXT: v_mov_b64_e32 v[36:37], v[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b64_e32 v[34:35], v[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[0:1] ; CHECK-NEXT: s_add_i32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index e29be2b744874..6108a5550747c 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -592,6 +592,8 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] ; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] ; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] @@ -605,8 +607,6 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] ; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] -; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] -; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index a7fcb6439703a..210c31d44418c 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: rotl_i32: ; R600: ; %bb.0: ; %entry @@ -137,9 +138,9 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_and_b32 s3, s6, 31 ; GFX8-NEXT: s_mov_b32 s1, s0 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -272,11 +273,11 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_and_b32 s3, s12, 31 ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 71c7797cbc68e..bec261cd97e26 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: rotr_i32: ; R600: ; %bb.0: ; %entry @@ -122,9 +123,9 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_mov_b32 s1, s0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -237,11 +238,11 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -309,227 +310,6 @@ entry: ret void } -define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) { -; R600-LABEL: rotr_v8i32: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X, -; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W, -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z, -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; 
R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X, -; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W, -; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z, -; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y, -; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, -; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; R600-NEXT: LSHR * T3.X, PV.W, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; SI-LABEL: rotr_v8i32: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s24, s19, 31 -; SI-NEXT: s_mov_b32 s4, s11 -; SI-NEXT: s_mov_b32 s5, s11 -; SI-NEXT: s_and_b32 s25, s18, 31 -; SI-NEXT: s_mov_b32 s11, s10 -; SI-NEXT: s_and_b32 s26, s17, 31 -; SI-NEXT: s_mov_b32 s6, s9 -; SI-NEXT: s_mov_b32 s7, s9 -; SI-NEXT: s_and_b32 s27, s16, 31 -; SI-NEXT: s_mov_b32 s9, s8 -; SI-NEXT: s_and_b32 s23, s23, 31 -; SI-NEXT: s_mov_b32 s16, s15 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_and_b32 s22, s22, 31 -; SI-NEXT: s_mov_b32 s15, s14 -; SI-NEXT: s_and_b32 s21, s21, 31 -; SI-NEXT: s_mov_b32 s18, s13 -; SI-NEXT: s_mov_b32 s19, s13 -; SI-NEXT: s_and_b32 s20, s20, 31 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s24 -; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s25 -; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s26 -; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s23 -; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], s21 -; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], s20 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s27 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s18 -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; GFX8-LABEL: rotr_v8i32: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s19, 31 -; GFX8-NEXT: s_mov_b32 s2, s11 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX8-NEXT: s_and_b32 s3, s17, 31 -; GFX8-NEXT: s_mov_b32 s6, s9 -; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: s_and_b32 s5, s18, 31 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 -; GFX8-NEXT: s_and_b32 s3, s16, 31 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: s_and_b32 s3, s23, 31 -; GFX8-NEXT: s_mov_b32 s10, s15 -; GFX8-NEXT: s_mov_b32 s11, s15 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s3 -; GFX8-NEXT: s_and_b32 s3, s22, 31 -; GFX8-NEXT: s_mov_b32 s15, s14 -; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s3 -; GFX8-NEXT: s_and_b32 s3, s21, 31 -; GFX8-NEXT: s_mov_b32 s16, s13 -; GFX8-NEXT: s_mov_b32 s17, s13 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[16:17], s3 -; GFX8-NEXT: s_and_b32 s3, s20, 31 -; GFX8-NEXT: s_mov_b32 s13, s12 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_endpgm -; -; GFX10-LABEL: rotr_v8i32: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s19, s19, 31 -; GFX10-NEXT: s_mov_b32 s2, s11 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_and_b32 s17, s17, 31 -; GFX10-NEXT: s_mov_b32 s4, s9 -; GFX10-NEXT: s_mov_b32 s5, s9 -; GFX10-NEXT: s_and_b32 s16, s16, 31 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 -; GFX10-NEXT: s_and_b32 s23, s23, 31 -; GFX10-NEXT: s_mov_b32 s6, s15 -; GFX10-NEXT: s_mov_b32 s7, s15 -; GFX10-NEXT: s_and_b32 s22, s22, 31 -; GFX10-NEXT: s_mov_b32 s15, s14 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 -; GFX10-NEXT: s_mov_b32 s16, s13 -; GFX10-NEXT: s_mov_b32 s17, s13 -; GFX10-NEXT: s_and_b32 s3, s20, 31 -; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: s_and_b32 s5, s21, 31 -; GFX10-NEXT: s_and_b32 s18, s18, 31 -; GFX10-NEXT: s_mov_b32 s11, s10 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: v_mov_b32_e32 v1, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s14 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s10 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: rotr_v8i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 -; GFX11-NEXT: s_load_b64 
s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s19, s19, 31 -; GFX11-NEXT: s_mov_b32 s2, s11 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_and_b32 s17, s17, 31 -; GFX11-NEXT: s_mov_b32 s4, s9 -; GFX11-NEXT: s_mov_b32 s5, s9 -; GFX11-NEXT: s_and_b32 s16, s16, 31 -; GFX11-NEXT: s_mov_b32 s9, s8 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 -; GFX11-NEXT: s_and_b32 s23, s23, 31 -; GFX11-NEXT: s_mov_b32 s6, s15 -; GFX11-NEXT: s_mov_b32 s7, s15 -; GFX11-NEXT: s_and_b32 s22, s22, 31 -; GFX11-NEXT: s_mov_b32 s15, s14 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 -; GFX11-NEXT: s_mov_b32 s16, s13 -; GFX11-NEXT: s_mov_b32 s17, s13 -; GFX11-NEXT: s_and_b32 s3, s20, 31 -; GFX11-NEXT: s_mov_b32 s13, s12 -; GFX11-NEXT: s_and_b32 s5, s21, 31 -; GFX11-NEXT: s_and_b32 s18, s18, 31 -; GFX11-NEXT: s_mov_b32 s11, s10 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16 -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_mov_b32_e32 v6, s10 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX11-NEXT: s_endpgm -entry: - %tmp0 = sub <8 x i32> , %y - %tmp1 = shl <8 x i32> %x, %tmp0 - %tmp2 = lshr <8 x i32> %x, %y - %tmp3 = or <8 x i32> %tmp1, %tmp2 - store <8 x i32> %tmp3, ptr addrspace(1) %in - ret void -} - declare i16 @llvm.fshr.i16(i16, i16, i16) define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) { 
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 15fc987d1e7c6..3a08a6e38f493 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash < %s | FileCheck -check-prefix=GCN %s + define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: @@ -35,13 +36,13 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -137,9 +138,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen @@ -175,11 +176,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 ; GCN-NEXT: buffer_store_dword 
v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -213,11 +214,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -252,11 +253,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s1, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 8861b7726a4c5..a98c04f9d11dd 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -351,14 +351,14 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s12, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_addc_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 
s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 @@ -377,9 +377,9 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -437,8 +437,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_addc_u32 s9, s5, s7 ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: s_xor_b32 s4, s6, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index fdb20f372ab8d..c73241aa226ac 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: @@ -154,8 +155,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; 
GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -239,9 +240,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -452,8 +453,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1327,9 +1328,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1510,8 +1511,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 @@ -1703,8 
+1704,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 @@ -1797,10 +1798,10 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 -; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..b539fb548d640 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2107,8 +2107,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; NOSDWA-NEXT: .LBB22_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 ; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3 -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: s_lshr_b32 s7, s6, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_mov_b64 s[4:5], 1 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 @@ -2129,8 +2129,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX89-NEXT: .LBB22_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_lshl_b32 s7, s4, 3 -; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: s_lshr_b32 s7, s6, s7 +; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: v_mov_b32_e32 
v1, s5 ; GFX89-NEXT: s_mov_b64 s[4:5], 1 ; GFX89-NEXT: v_mov_b32_e32 v2, s7 @@ -2151,8 +2151,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX9-NEXT: .LBB22_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_lshl_b32 s7, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_lshr_b32 s7, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 @@ -2174,8 +2174,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_lshl_b32 s7, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 ; GFX10-NEXT: flat_store_byte v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir index f11fe4aa6e00e..2cb85bb3fb3b7 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -75,8 +75,8 @@ body: | ; GFX9-LABEL: name: sgpr96_aligned_src_dst ; GFX9: liveins: $sgpr0_sgpr1_sgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2, implicit-def $sgpr4_sgpr5_sgpr6 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2 + ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1 $sgpr4_sgpr5_sgpr6 = COPY $sgpr0_sgpr1_sgpr2 ... 
@@ -88,8 +88,8 @@ body: | ; GFX9-LABEL: name: sgpr96_killed ; GFX9: liveins: $sgpr4_sgpr5_sgpr6 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr10 = S_MOV_B32 $sgpr6, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr8_sgpr9_sgpr10 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr4_sgpr5, implicit killed $sgpr4_sgpr5_sgpr6 + ; GFX9-NEXT: $sgpr10 = S_MOV_B32 killed $sgpr6 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr4_sgpr5 $sgpr8_sgpr9_sgpr10 = COPY killed $sgpr4_sgpr5_sgpr6 ... @@ -101,8 +101,8 @@ body: | ; GFX9-LABEL: name: sgpr128_forward ; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -114,8 +114,8 @@ body: | ; GFX9-LABEL: name: sgpr128_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1 $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ... 
@@ -127,8 +127,8 @@ body: | ; GFX9-LABEL: name: sgpr128_killed ; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 killed $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -140,9 +140,9 @@ body: | ; GFX9-LABEL: name: sgpr160_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr4 = S_MOV_B32 $sgpr12, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4 = S_MOV_B32 $sgpr12 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ... @@ -154,9 +154,9 @@ body: | ; GFX9-LABEL: name: sgpr160_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... 
@@ -168,9 +168,9 @@ body: | ; GFX9-LABEL: name: sgpr160_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9-NEXT: $sgpr12 = S_MOV_B32 killed $sgpr4 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... @@ -183,9 +183,9 @@ body: | ; GFX9-LABEL: name: sgpr192_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ... 
@@ -197,9 +197,9 @@ body: | ; GFX9-LABEL: name: sgpr192_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... @@ -211,9 +211,9 @@ body: | ; GFX9-LABEL: name: sgpr192_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... 
@@ -225,10 +225,10 @@ body: | ; GFX9-LABEL: name: sgpr256_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... 
@@ -240,10 +240,10 @@ body: | ; GFX9-LABEL: name: sgpr256_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -255,10 +255,10 @@ body: | ; GFX9-LABEL: name: sgpr256_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -270,14 +270,14 @@ body: | ; GFX9-LABEL: name: sgpr512_forward ; GFX9: liveins: $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9-NEXT: 
$sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... @@ -289,14 +289,14 @@ body: | ; GFX9-LABEL: name: sgpr512_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 
$sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... @@ -308,14 +308,14 @@ body: | ; GFX9-LABEL: name: sgpr512_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: 
$sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... 
@@ -327,22 +327,22 @@ body: | ; GFX9-LABEL: name: sgpr1024_forward ; GFX9: liveins: $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41, implicit 
$sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr50_sgpr51, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: 
$sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 
$sgpr50_sgpr51 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ... @@ -354,22 +354,22 @@ body: | ; GFX9-LABEL: name: sgpr1024_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 
$sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... 
@@ -381,21 +381,21 @@ body: | ; GFX9-LABEL: name: sgpr1024_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr52_sgpr53 = 
S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, 
implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 killed $sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 killed $sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 killed $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 killed $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 killed $sgpr22_sgpr23 + ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 killed $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 killed $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 killed $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr34_sgpr35 = 
S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll index 076fff7612428..fe7f1d85acfa0 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -29,13 +29,13 @@ define amdgpu_kernel void @kernel() { ; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: s_mov_b32 s14, s10 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 2b698d3ee4854..8eb09d6884351 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(ptr addrspace(1) %out, ptr add ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: 
s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -73,10 +73,10 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(ptr addrspace(1) %out, ptr add ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 offset:8 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 760a126afa995..f7c8320739ef1 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -11,11 +11,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -69,11 +69,11 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 
0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -98,11 +98,11 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -183,11 +183,11 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -212,11 +212,11 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -241,11 +241,11 @@ define amdgpu_kernel void 
@v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -272,11 +272,11 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 1c2215d39dc02..37b5060ce7566 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -21064,10 +21064,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s46, s16 ; GFX900-NEXT: s_mov_b32 s47, s17 @@ -21105,10 +21105,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s46, s16 ; GFX90A-NEXT: s_mov_b32 s47, s17 @@ -21288,10 +21288,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s50, s16 ; GFX900-NEXT: s_mov_b32 s51, s17 @@ -21329,10 +21329,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s50, s16 ; GFX90A-NEXT: s_mov_b32 s51, s17 @@ -21364,10 +21364,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s30, s12 ; GFX942-NEXT: s_mov_b32 s31, s13 @@ -22124,10 +22124,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; 
GFX900-NEXT: s_mov_b32 s46, s18 ; GFX900-NEXT: s_mov_b32 s47, s19 @@ -22165,10 +22165,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s46, s18 ; GFX90A-NEXT: s_mov_b32 s47, s19 @@ -22348,10 +22348,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s50, s18 ; GFX900-NEXT: s_mov_b32 s51, s19 @@ -22389,10 +22389,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s50, s18 ; GFX90A-NEXT: s_mov_b32 s51, s19 @@ -22424,10 +22424,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s30, s14 ; GFX942-NEXT: s_mov_b32 s31, s15 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 
8fcaf5e15f7d5..64e20a0d284a7 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -30,17 +30,17 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s13, s15 ; GFX9-NEXT: s_mov_b32 s12, s14 ; GFX9-NEXT: s_mov_b64 s[14:15], src_private_base -; GFX9-NEXT: v_mov_b32_e32 v5, s18 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s18 ; GFX9-NEXT: v_mov_b32_e32 v6, s19 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b32 s14, s16 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 101787abf8ea7..f1f0d737b08fc 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -61,9 +61,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -78,9 +78,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: 
v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -356,9 +356,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -373,9 +373,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -437,9 +437,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -454,9 +454,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -549,9 +549,9 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr 
addrspace(1) ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -566,9 +566,9 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 41ffd01fc7e23..c88557e7d0ea3 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -7,9 +7,9 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 76f8f484fc763..24e737d50459e 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -359,9 +359,9 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI-NEXT: s_or_b32 s2, s4, s2 ; VI-NEXT: s_add_i32 s3, s3, 0x20000 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: 
v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index cf23a9d1e8a57..6f8e80277a492 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -45,7 +45,7 @@ body: | ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr55 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vcc = COPY renamable $sgpr34_sgpr35 + ; CHECK-NEXT: $vcc = COPY_LANEMASK renamable $sgpr34_sgpr35, lanemask(0x000000000000000C) ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 1a0f75e048cb9..cc48b4dd02e67 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -31,9 +31,9 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX908-NEXT: .LBB0_2: ; %use ; GFX908-NEXT: s_nop 2 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a7 ; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 ; GFX908-NEXT: v_accvgpr_write_b32 a8, 5 ; GFX908-NEXT: v_accvgpr_write_b32 a9, 1 @@ -83,6 +83,7 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 
4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2 @@ -90,7 +91,6 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 @@ -128,9 +128,9 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: ;;#ASMSTART @@ -139,8 +139,8 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 ; GFX908-NEXT: ;;#ASMSTART @@ -394,11 +394,11 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_waitcnt vmcnt(4) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3] ; GFX908-NEXT: s_nop 3 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b5474b8974b29..39af96cd9762a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10399,11 +10399,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload @@ -10411,6 +10408,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1 @@ -10544,124 +10544,124 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 
v85, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v60 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v34 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v22 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v26 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 
v49, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v24 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v28 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 +; GFX10-FLATSCR-NEXT: ;;#ASMSTART +; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 +; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 +; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v85, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v46 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v42 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 ; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v38 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v58 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v59 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 -; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 -; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v89 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v90 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v91 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v92 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v86 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v87 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v88 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v82 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v83 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v84 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v78 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v79 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v80 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v74 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v75 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v76 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v70 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v71 ; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v42, v72 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b14e97d..cf24ebe9be2c1 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s --check-prefixes=TONGA ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG + define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: @@ -1649,8 +1650,8 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_cmp_ge_u32 s3, s2 ; GCN-NEXT: s_cselect_b32 s8, s4, s3 ; GCN-NEXT: .LBB8_3: -; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm @@ -3332,8 +3333,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 +; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: v_mov_b32_e32 v5, s7 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -6120,18 +6121,18 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 -; TONGA-NEXT: v_mov_b32_e32 v0, s6 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v1, s7 ; TONGA-NEXT: s_add_u32 s2, 
s6, 32 -; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 ; TONGA-NEXT: s_addc_u32 s3, s7, 0 +; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 ; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: s_add_u32 s0, s6, 16 +; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 ; TONGA-NEXT: v_mov_b32_e32 v4, s0 @@ -6688,11 +6689,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc ; TONGA-NEXT: .LBB12_14: +; TONGA-NEXT: s_add_u32 s0, s4, 16 ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; TONGA-NEXT: v_mov_b32_e32 v1, s5 -; TONGA-NEXT: s_add_u32 s0, s4, 16 -; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: s_addc_u32 s1, s5, 0 +; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -9033,9 +9034,9 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_mov_b32_e32 v4, s4 ; TONGA-NEXT: v_mov_b32_e32 v5, s5 ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: v_mov_b32_e32 v9, s1 ; TONGA-NEXT: v_mov_b32_e32 v8, s0 ; TONGA-NEXT: s_add_u32 s0, s0, 16 +; TONGA-NEXT: v_mov_b32_e32 v9, s1 ; TONGA-NEXT: s_addc_u32 s1, s1, 0 ; TONGA-NEXT: v_mov_b32_e32 v11, s1 ; TONGA-NEXT: v_mov_b32_e32 v10, s0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 02d2e6c1473ab..4400d52e46fbb 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; 
RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: @@ -433,8 +434,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1189,9 +1190,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1659,8 +1660,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 @@ -1850,8 +1851,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 
; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 @@ -1950,10 +1951,10 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 382d8928a28b0..a5ed626849228 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -348,14 +348,14 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s12, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_subb_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 @@ -374,9 +374,9 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ 
-434,8 +434,8 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_subb_u32 s9, s5, s7 ; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: s_xor_b32 s4, s6, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index d33e94809b326..d201b1124a3a3 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -16,19 +16,19 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 ; MUBUF-NEXT: s_add_u32 s36, s36, s11 -; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s0 ; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] -; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d2394bab82c77..a8d93c61d0424 100644 --- 
a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -855,13 +855,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE32-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE32-OPT-NEXT: s_mov_b32 s12, s8 ; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE32-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo @@ -892,13 +892,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE64-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE64-OPT-NEXT: s_mov_b32 s12, s8 ; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE64-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index a4e23ae87614f..b2b9f5bc93365 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ 
b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -67,9 +67,9 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out @@ -521,8 +521,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm @@ -535,8 +535,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm @@ -575,10 +575,9 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 8 @@ -647,9 +646,9 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 480eb0dd5fe9c..d9308e9188add 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -55,8 +55,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HAWAII-NEXT: s_add_u32 s0, s8, 14 ; HAWAII-NEXT: s_addc_u32 s1, s9, 0 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] ; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index 
b4036517cc0d5..af206fb40e5b2 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll @@ -94,8 +94,8 @@ define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { ; GFX900-NEXT: s_bitcmp1_b32 s2, 0 ; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v2, s1 ; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 ; GFX900-NEXT: v_mov_b32_e32 v1, s0 ; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 ; GFX900-NEXT: s_branch .LBB2_2 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index d10ef709f8e33..5cbf4d66314d9 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -321,8 +321,8 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s3, s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 @@ -666,8 +666,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX8-NEXT: s_sub_u32 s0, s0, s2 ; GFX8-NEXT: s_subb_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -692,8 +692,9 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b @@ -944,9 +945,9 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 @@ -958,8 +959,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v10, v14 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v8, v12 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 4621be5cab450..007a384ca9299 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -12,20 +12,15 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) # ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; 
CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.1: ; %ift ; CHECK-NEXT: s_mov_b32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: ; %bb.2: ; %ife ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index d4a8a0d762afd..81383725c5ac5 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=4 -o - %s | FileCheck %s + ; Make sure we can rematerialize split 64-bit constants (which ; MachineLICM hoisted out of the loop) and avoid spilling inside the ; loop. @@ -15,16 +16,16 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[2:3], 0x100 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40260000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 9c0beb2ed358c..1996a8e272c5a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ 
b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -135,8 +135,8 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0 ; HSA-TRAP-GFX803-NEXT: s_cbranch_vccz .LBB1_2 ; HSA-TRAP-GFX803-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) @@ -423,9 +423,9 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index 1d96921ec1287..6ac3a4205613b 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -94,11 +94,11 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -201,11 +201,11 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; 
VI-NEXT: s_or_b32 s5, s7, s5 ; VI-NEXT: s_or_b32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 76f60f1e5dbfc..a6c126a937e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -99,8 +99,8 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 1f93bf7a68972..0a3ef92e851a3 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -1760,8 +1760,8 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_u32 s6, s2, 2 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_add_u32 s6, s2, 6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -1960,8 +1960,8 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_u32 s6, s2, 2 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_add_u32 s6, s2, 6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 diff --git 
a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 1c50f930facba..5620ad29f6c36 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: @@ -186,9 +187,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -372,8 +373,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -961,9 +962,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 
0 ; GCN-IR-NEXT: s_endpgm @@ -1132,8 +1133,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 @@ -1215,10 +1216,10 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 @@ -1324,9 +1325,9 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB11_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1413,10 +1414,10 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: 
v_mov_b32_e32 v7, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index eaab3531824c4..93c9dd4b672b3 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -218,6 +218,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_sub_i32 s6, 0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -240,7 +241,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s0, s6, s0 ; GFX8-NEXT: s_sub_i32 s2, 0, s3 ; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 983acfc2c0699..6f60ce0b64787 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -211,9 +211,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; SI-NEXT: v_ldexp_f64 v[6:7], v[8:9], 32 ; SI-NEXT: v_ldexp_f64 v[8:9], v[10:11], 32 ; SI-NEXT: s_add_u32 s0, s8, 16 -; SI-NEXT: s_addc_u32 s1, s9, 0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] ; SI-NEXT: v_add_f64 v[4:5], v[8:9], v[12:13] +; SI-NEXT: s_addc_u32 s1, s9, 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -250,8 +250,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: s_add_u32 s0, s8, 16 ; VI-NEXT: s_addc_u32 s1, s9, 0 ; VI-NEXT: 
v_mov_b32_e32 v11, s1 -; VI-NEXT: v_mov_b32_e32 v8, s8 ; VI-NEXT: v_mov_b32_e32 v10, s0 +; VI-NEXT: v_mov_b32_e32 v8, s8 ; VI-NEXT: v_mov_b32_e32 v9, s9 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -368,11 +368,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2 -; SI-NEXT: s_add_u32 s0, s4, 16 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 +; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; SI-NEXT: s_add_u32 s0, s4, 16 ; SI-NEXT: s_addc_u32 s1, s5, 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 @@ -391,11 +391,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2 -; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 +; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 @@ -437,9 +437,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -454,9 +454,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -615,9 +615,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -632,9 +632,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -696,9 +696,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -713,9 +713,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; 
VI-NEXT: s_endpgm @@ -778,9 +778,9 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -795,9 +795,9 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll index 22e4a24435f12..ce52b91723371 100644 --- a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -209,8 +209,8 @@ define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) # ; GFX9-LABEL: s_underflow_compare_fold_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s2, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index fc32bc644ddcd..e85536559a0ca 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + declare ptr @G() define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x i32> %vec) { @@ -94,13 +95,11 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: .LBB0_4: ; %Flow8 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[42:43], v[42:43] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[44:45], v[44:45] op_sel:[0,1] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_8 ; CHECK-NEXT: ; %bb.5: ; %LeafBlock ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 ; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[42:43], v[42:43] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[44:45], v[44:45] op_sel:[0,1] ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.6: ; %sw.bb.i.i.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 31708a9b738db..549adae64ab37 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -103,9 +103,6 @@ define amdgpu_kernel void @partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v6, 6 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v0, v5 -; CHECK-NEXT: v_mov_b32_e32 v1, v6 -; CHECK-NEXT: v_mov_b32_e32 v2, v7 -; CHECK-NEXT: v_mov_b32_e32 v3, v8 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: v_mov_b32_e32 v0, v6 diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index a81d9a458e23a..69e2eedaa4c86 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -64,6 +64,22 @@ define void 
@eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[32:63], v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[6:9] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 ; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 @@ -86,24 +102,8 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: v_accvgpr_write_b32 a44, v12 ; CHECK-NEXT: v_accvgpr_write_b32 a43, v11 ; CHECK-NEXT: v_accvgpr_write_b32 a42, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 ; CHECK-NEXT: v_accvgpr_write_b32 a37, v5 ; CHECK-NEXT: v_accvgpr_write_b32 a36, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:9] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -268,6 +268,8 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 
%x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[32:63], v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 ; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 @@ -299,9 +301,7 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 ; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 ; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 28e6627b87413..69f1f73683c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: @@ -408,8 +409,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1268,8 +1269,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 
v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 @@ -1357,10 +1358,10 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index ca93fcf3f55a2..cc2a62d49006b 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -48,10 +48,10 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2361,10 +2361,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ 
-2473,11 +2473,11 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 99b6ab7a6401b..8d29bca970541 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -248,8 +248,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 ; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-VI-NEXT: flat_store_dword v[0:1], v2 ; GISEL-VI-NEXT: s_endpgm @@ -749,8 +749,8 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 ; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-VI-NEXT: flat_store_dword v[0:1], v2 ; GISEL-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 6bf6d540299f1..1922f61b56793 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -2212,10 +2212,10 @@ define amdgpu_kernel void 
@shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX11-NEXT: s_endpgm %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 0ae31be32ed51..efa59eb8a9ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s + define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: @@ -872,8 +873,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 ; GFX1032-NEXT: s_cselect_b32 s8, s3, s1 ; GFX1032-NEXT: .LBB15_3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1032-NEXT: s_endpgm @@ -1024,8 +1025,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_cmp_ge_u32 s2, s0 ; GFX1064-NEXT: s_cselect_b32 s4, s3, s1 ; GFX1064-NEXT: .LBB15_3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1064-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index bfad131dc4413..96808e5c39ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -1273,11 +1273,10 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL-NEXT: s_mov_b32 s1, s6 ; GISEL-NEXT: s_mov_b32 s2, s7 ; GISEL-NEXT: s_mov_b32 s3, s8 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s9 ; GISEL-NEXT: scratch_store_b32 off, v4, s10 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 0fdc1a83dddbd..57e0a0dfb3ea8 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1943,9 +1943,9 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 72672c8b6efad..1e27497dac736 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ 
b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -422,13 +422,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] @@ -678,8 +678,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) { ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 @@ -1252,13 +1252,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; 
GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] @@ -1508,8 +1508,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 92280b9ad8acf..8e10f21e3a089 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -643,9 +643,9 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s1, s1, 0xf237b ; VI-NEXT: s_xor_b32 s0, s0, 0x3039 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -685,13 +685,13 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; VI-NEXT: s_mov_b32 s7, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x3039 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0xf237b +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -725,9 +725,9 @@ define amdgpu_kernel void 
@scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s0, s0, 63 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -757,8 +757,8 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 049ee47af9681..0edfb2503a08c 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -606,18 +606,22 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %ebx ; i686-NEXT: movl 76(%esp,%ebx), %ebx +; i686-NEXT: movl %eax, %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: shldl %cl, %esi, %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: movl %edi, %esi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %esi +; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %edx ; i686-NEXT: movl 108(%esp,%edx), %edx +; i686-NEXT: movl %eax, %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload ; i686-NEXT: shldl %cl, %eax, %edx ; i686-NEXT: movl 72(%ebp), %eax diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll index f9b73d4eaf92c..c39be0beff0c7 100644 --- a/llvm/test/CodeGen/X86/swift-return.ll +++ b/llvm/test/CodeGen/X86/swift-return.ll @@ -412,9 +412,9 @@ define swiftcc { i32, i32, i32, i32 } @gen7(i32 %key) { ; CHECK-LABEL: gen7: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl %eax, %r8d ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen7: @@ -435,9 +435,9 @@ define swiftcc { i64, i64, i64, i64 } @gen8(i64 %key) { ; CHECK-LABEL: gen8: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen8: @@ -483,9 +483,9 @@ define swiftcc { double, double, double, double, i64, i64, i64, i64 } @gen10(dou ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen10: