From 0a6cd828aae0ad6b42a0e5f0feaf3859c6511810 Mon Sep 17 00:00:00 2001 From: vikashgu Date: Fri, 25 Jul 2025 08:37:53 +0000 Subject: [PATCH 1/4] [CodeGen] Encode liveness for copy used MO after virtRegRewriter. As for the Greedy RA, the virtRegRewriter pass is the last place that holds liveness info, even at subregister level. So, now that information can be extracted and encoded on COPY instruction. This information for COPY can later be used to identify partially live registers precisely, assuming the liveness information used is not invalidated by any kind of IR mutation later. --- llvm/include/llvm/Target/Target.td | 2 +- llvm/lib/CodeGen/VirtRegMap.cpp | 89 +++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 315de55b75510..ae2181151b351 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1346,7 +1346,7 @@ def REG_SEQUENCE : StandardPseudoInstruction { } def COPY : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); - let InOperandList = (ins unknown:$src); + let InOperandList = (ins unknown:$src, variable_ops); let AsmString = ""; let hasSideEffects = false; let isAsCheapAsAMove = true; diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 972bd8f550e8b..1b299305bd450 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -213,6 +213,8 @@ class VirtRegRewriter { void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; + uint64_t calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const; void addLiveInsForSubRanges(const LiveInterval &LI, MCRegister PhysReg) const; void handleIdentityCopy(MachineInstr &MI); void expandCopyBundle(MachineInstr &MI) const; @@ -474,6 +476,77 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { return true; } +// Return 
LaneBitmask value as uint64_t for PhysReg assigned to MO, +// representing its live register units at its parent MI. In case of undef or +// fully live MO, return 0u. +uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const { + Register Reg = MO.getReg(); + const LiveInterval &LI = LIS->getInterval(Reg); + const MachineInstr &MI = *MO.getParent(); + SlotIndex MIIndex = LIS->getInstructionIndex(MI); + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = SubRegIdx + ? TRI->getSubRegIndexLaneMask(SubRegIdx) + : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg) + : LaneBitmask::getNone()); + + LaneBitmask LiveRegUnitMask; + DenseSet<unsigned> LiveRegUnits; + + // dbgs() << "\n********** " << printReg(Reg, TRI) << "[ " << + // printReg(PhysReg, TRI) << " ]" << " **********\n"; + + if (MO.isUndef()) + return 0u; + + assert(LI.liveAt(MIIndex) && + "Reads of completely dead register should be marked undef already"); + + if (LI.hasSubRanges()) { + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + unsigned Unit = (*Units).first; + LaneBitmask Mask = (*Units).second; + for (const LiveInterval::SubRange &S : LI.subranges()) { + if ((S.LaneMask & UseMask & Mask).any() && S.liveAt(MIIndex)) { + LiveRegUnits.insert(Unit); + } + } + } + } else { + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + unsigned Unit = (*Units).first; + const LiveRange &UnitRange = LIS->getRegUnit(Unit); + LaneBitmask Mask = (*Units).second; + + if (UnitRange.liveAt(MIIndex) && (UseMask & Mask).any()) + LiveRegUnits.insert(Unit); + } + } + + // Consider the exact subregister & create new UseMask as per the RC for it. 
+ if (SubRegIdx != 0) { + PhysReg = TRI->getSubReg(PhysReg, SubRegIdx); + UseMask = (TRI->getMinimalPhysRegClass(PhysReg))->getLaneMask(); + } + + for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + unsigned Unit = (*Units).first; + LaneBitmask Mask = (*Units).second; + if (LiveRegUnits.count(Unit)) { + // dbgs() << "LIVE DEF UNIT : " << printRegUnit(Unit, TRI) << '\n'; + LiveRegUnitMask |= Mask; + } + } + + // dbgs() << "UseMask : " << PrintLaneMask(UseMask) << '\n'; + // dbgs() << "LiveRegUnitMask : " << PrintLaneMask(LiveRegUnitMask) << '\n'; + if (UseMask == LiveRegUnitMask) + return 0u; + + return LiveRegUnitMask.getAsInteger(); +} + void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { if (!MI.isIdentityCopy()) return; @@ -495,7 +568,11 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { // give us additional liveness information: The target (super-)register // must not be valid before this point. Replace the COPY with a KILL // instruction to maintain this information. - if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) { + + // Avoid a COPY with exactly three operands, where the third operand is the + // Mask, as it is the same as a COPY with no additional liveness information. + if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 3 || + (MI.getNumOperands() == 3 && !MI.getOperand(2).isImm())) { MI.setDesc(TII->get(TargetOpcode::KILL)); LLVM_DEBUG(dbgs() << " replace by: " << MI); return; @@ -641,11 +718,14 @@ void VirtRegRewriter::rewrite() { SmallVector SuperDeads; SmallVector SuperDefs; SmallVector SuperKills; + uint64_t Mask; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { LLVM_DEBUG(MBBI->print(dbgs(), Indexes)); for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) { + // Reset for each MI. + Mask = 0u; for (MachineOperand &MO : MI.operands()) { // Make sure MRI knows about registers clobbered by regmasks. 
if (MO.isRegMask()) @@ -663,6 +743,9 @@ void VirtRegRewriter::rewrite() { RewriteRegs.insert(PhysReg); assert(!MRI->isReserved(PhysReg) && "Reserved register assignment"); + if (MO.isUse() && MI.isCopy()) + Mask = calcLiveRegUnitMask(MO, PhysReg); + // Preserve semantics of sub-register operands. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { @@ -739,6 +822,10 @@ void VirtRegRewriter::rewrite() { MO.setIsRenamable(true); } + // Add LaneBitmask as MO_Imm + if (MI.isCopy() && Mask) + MI.addOperand(*MF, MachineOperand::CreateImm(Mask)); + // Add any missing super-register kills after rewriting the whole // instruction. while (!SuperKills.empty()) From e14f6a36a87a6497e9144309a90aaff4bce895fa Mon Sep 17 00:00:00 2001 From: vikashgu Date: Mon, 28 Jul 2025 09:49:08 +0000 Subject: [PATCH 2/4] [AMDGPU][CopyPhysReg] Expand the COPY using the encoded liveness mask. We will now use the liveness encoded during VirtRegRewriter for COPY instruction to expand only defined registers, thus avoiding the undefined registers. It enables us to stop using implicit and implicit-def avoiding unnecessary false dependency among the registers. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 148 +-- llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 1028 ++++++++--------- ...hys-reg-implicit-operand-kills-subregs.mir | 5 +- llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir | 493 ++++---- 4 files changed, 836 insertions(+), 838 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6d2110957002a..6149b6d969717 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -694,16 +694,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, I->clearRegisterKills(DefOp.getReg(), &RI); } - MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) - .add(DefOp); - if (ImpDefSuperReg) - Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); - - if (ImpUseSuperReg) { - Builder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + .add(DefOp); return; } @@ -747,27 +739,26 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) .addReg(SrcReg, getKillRegState(KillSrc)); - if (ImpUseSuperReg) { - UseBuilder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } - MachineInstrBuilder DefBuilder - = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) - .addReg(Tmp, RegState::Kill); - - if (ImpDefSuperReg) - DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + .addReg(Tmp, RegState::Kill); } static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, - const TargetRegisterClass *RC, bool Forward) { + const TargetRegisterClass *RC, bool Forward, 
+ uint64_t LiveRegUnitMaskVal) { const SIRegisterInfo &RI = TII.getRegisterInfo(); ArrayRef BaseIndices = RI.getRegSplitParts(RC, 4); MachineBasicBlock::iterator I = MI; - MachineInstr *FirstMI = nullptr, *LastMI = nullptr; + bool isSrcRegFullLive = LiveRegUnitMaskVal == 0; + + uint64_t TestMaskVal = 0x0000000000000003; + uint8_t ShiftVal = 2; + + if (!Forward) + TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size() - 1)); for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; @@ -775,41 +766,47 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); unsigned Opcode = AMDGPU::S_MOV_B32; + bool IsFirstSubreg = Idx == 0; + + if (!IsFirstSubreg) { + TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; + } + + // Check for liveness of current subregister using TestMaskVal. + if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0)) + continue; // Is SGPR aligned? If so try to combine with next. bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0; bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0; - if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { + bool isSrc64Live = true; + + if (!isSrcRegFullLive) + isSrc64Live = Forward + ? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) != + uint64_t(0)) + : ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) != + uint64_t(0)); + + if (isSrc64Live && AlignedDest && AlignedSrc && + (Idx + 1 < BaseIndices.size())) { // Can use SGPR64 copy unsigned Channel = RI.getChannelFromSubReg(SubIdx); SubIdx = RI.getSubRegFromChannel(Channel, 2); DestSubReg = RI.getSubReg(DestReg, SubIdx); SrcSubReg = RI.getSubReg(SrcReg, SubIdx); assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); + TestMaskVal = Forward ? 
TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; Opcode = AMDGPU::S_MOV_B64; Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) - .addReg(SrcSubReg) - .addReg(SrcReg, RegState::Implicit); - - if (!FirstMI) - FirstMI = LastMI; + BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) + .addReg(SrcSubReg, getKillRegState(KillSrc)); if (!Forward) I--; } - - assert(FirstMI && LastMI); - if (!Forward) - std::swap(FirstMI, LastMI); - - FirstMI->addOperand( - MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); - - if (KillSrc) - LastMI->addRegisterKilled(SrcReg, &RI); } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -822,6 +819,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); + uint64_t LiveRegUnitMaskVal = 0; + if (MI->getNumOperands() > 2 && MI->getOperand(2).isImm()) { + LiveRegUnitMaskVal = MI->getOperand(2).getImm(); + } + + bool isSrcRegFullLive = LiveRegUnitMaskVal == 0; + // The rest of copyPhysReg assumes Src and Dst size are the same size. // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can // we remove Fix16BitCopies and this code block? 
@@ -1043,16 +1047,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (ST.hasPkMovB32()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) - .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcReg) - .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp return; } } @@ -1065,12 +1068,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, - Forward); + Forward, LiveRegUnitMaskVal); return; } unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; + uint64_t TestMaskVal = 0x0000000000000003; + uint8_t ShiftVal = 2; if (RI.isAGPRClass(RC)) { if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) Opcode = AMDGPU::V_ACCVGPR_MOV_B32; @@ -1085,12 +1090,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, (RI.isProperlyAlignedRC(*RC) && (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. + // TODO: In case of partial liveness, could do mix of 64-bit and 32-bit + // moves. Look expandSGPRCopy function for reference. 
if (ST.hasMovB64()) { Opcode = AMDGPU::V_MOV_B64_e32; EltSize = 8; + TestMaskVal = 0x000000000000000F; + ShiftVal = 4; } else if (ST.hasPkMovB32()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; + TestMaskVal = 0x000000000000000F; + ShiftVal = 4; } } @@ -1105,6 +1116,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, ArrayRef SubIndices = RI.getRegSplitParts(RC, EltSize); + // The TestMaskVal will scan from right to left. + if (!Forward) + TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size() - 1)); + // If there is an overlap, we can't kill the super-register on the last // instruction, since it will also kill the components made live by this def. const bool Overlap = RI.regsOverlap(SrcReg, DestReg); @@ -1121,7 +1136,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); bool IsFirstSubreg = Idx == 0; - bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; + bool UseKill = CanKillSuperReg; + + if (!IsFirstSubreg) { + TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal; + } + + if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0)) + continue; if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); @@ -1132,24 +1154,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)) .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); - if (IsFirstSubreg) - MIB.addReg(DestReg, RegState::Define | RegState::Implicit); + .addReg(SrcSubReg, getKillRegState(UseKill)) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp } else { MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); - if (IsFirstSubreg) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - - Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(Opcode), DestSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)); } } } diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index d22a4b978980f..4e9797e2686cd 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -88,22 +88,22 @@ body: | ; GFX908-LABEL: name: a2_to_v2 ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; ; 
GFX90A-LABEL: name: a2_to_v2 ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; ; GFX942-LABEL: name: a2_to_v2 ; GFX942: liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1 @@ -119,25 +119,25 @@ body: | ; GFX908-LABEL: name: a3_to_v3 ; GFX908: liveins: $agpr0_agpr1_agpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; ; GFX90A-LABEL: name: 
a3_to_v3 ; GFX90A: liveins: $agpr0_agpr1_agpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; ; GFX942-LABEL: name: a3_to_v3 ; GFX942: liveins: $agpr0_agpr1_agpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 @@ -152,28 +152,28 @@ body: | ; GFX908-LABEL: name: a4_to_v4 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; 
GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; ; GFX90A-LABEL: name: a4_to_v4 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; ; GFX942-LABEL: name: a4_to_v4 ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr2 = 
V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 @@ -189,40 +189,40 @@ body: | ; GFX908-LABEL: name: a8_to_v8 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 
implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; ; GFX90A-LABEL: name: a8_to_v8 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit 
$exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; ; GFX942-LABEL: name: a8_to_v8 ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; 
GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -237,64 +237,64 @@ body: | ; GFX908-LABEL: name: a16_to_v16 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; 
GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX908-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX908-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX908-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX908-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec + ; GFX908-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX908-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; ; GFX90A-LABEL: name: a16_to_v16 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, 
implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX90A-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX90A-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX90A-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, 
implicit $exec + ; GFX90A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX90A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; ; GFX942-LABEL: name: a16_to_v16 ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = 
V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX942-NEXT: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec + ; GFX942-NEXT: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec + ; GFX942-NEXT: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec + ; GFX942-NEXT: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec + ; GFX942-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec + ; GFX942-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec + ; GFX942-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec + ; GFX942-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -336,22 +336,22 @@ body: | ; GFX908-LABEL: name: v2_to_a2 ; GFX908: liveins: $vgpr0_vgpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX908-NEXT: $agpr0 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX90A-LABEL: name: v2_to_a2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX942-LABEL: name: v2_to_a2 ; GFX942: liveins: $vgpr0_vgpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 @@ -366,25 +366,25 @@ body: | ; GFX908-LABEL: name: v3_to_a3 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: v3_to_a3 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: v3_to_a3 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -399,28 +399,28 @@ body: | ; GFX908-LABEL: name: v4_to_a4 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: v4_to_a4 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: v4_to_a4 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 @@ -435,40 +435,40 @@ body: | ; GFX908-LABEL: name: v8_to_a8 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - 
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: v8_to_a8 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr6 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: v8_to_a8 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 
$vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -483,64 +483,64 @@ body: | ; GFX908-LABEL: name: v16_to_a16 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr13 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: v16_to_a16 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec 
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: v16_to_a16 ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit 
$exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; 
GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -583,24 +583,24 @@ body: | ; GFX908-LABEL: name: s2_to_a2 ; GFX908: liveins: $sgpr0_sgpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX90A-LABEL: name: s2_to_a2 ; GFX90A: liveins: $sgpr0_sgpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 ; ; GFX942-LABEL: name: s2_to_a2 ; GFX942: liveins: $sgpr0_sgpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 
$agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 @@ -615,28 +615,28 @@ body: | ; GFX908-LABEL: name: s3_to_a3 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: s3_to_a3 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: s3_to_a3 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - 
; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -651,32 +651,32 @@ body: | ; GFX908-LABEL: name: s4_to_a4 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: s4_to_a4 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: s4_to_a4 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = 
V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 @@ -691,40 +691,40 @@ body: | ; GFX908-LABEL: name: s6_to_a6 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 
killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-LABEL: name: s6_to_a6 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX942-LABEL: name: s6_to_a6 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 @@ -739,48 +739,48 @@ body: | ; GFX908-LABEL: name: s8_to_a8 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed 
$vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr6, implicit $exec ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: s8_to_a8 ; GFX90A: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: s8_to_a8 ; GFX942: 
liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -795,80 +795,80 @@ body: | ; GFX908-LABEL: name: s16_to_a16 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr6, implicit $exec ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr9, implicit $exec ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr10, implicit $exec ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr11, implicit $exec ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr12, implicit $exec ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr13, implicit $exec ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr14, implicit $exec ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: s16_to_a16 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec 
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr9, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr10, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr11, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr12, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr13, implicit $exec + ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr14, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr15, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: s16_to_a16 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit 
$exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; 
GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr4, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr5, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr6, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr7, implicit $exec + ; GFX942-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr9, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr10, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr11, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr12, implicit $exec + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr13, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr14, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr15, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -908,9 +908,9 @@ body: | ; GFX908-LABEL: name: a2_to_a2 ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -918,16 +918,16 @@ body: | ; GFX90A-LABEL: name: a2_to_a2 ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; ; GFX942-LABEL: name: a2_to_a2 ; GFX942: liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = 
V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY $agpr0_agpr1, implicit $exec @@ -944,9 +944,9 @@ body: | ; GFX908-LABEL: name: a2_to_a2_kill ; GFX908: liveins: $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -954,16 +954,16 @@ body: | ; GFX90A-LABEL: name: a2_to_a2_kill ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; ; GFX942-LABEL: name: a2_to_a2_kill ; GFX942: liveins: $agpr0_agpr1 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr1_agpr2, implicit $agpr0_agpr1 - ; GFX942-NEXT: $agpr1 = 
V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec @@ -984,9 +984,9 @@ body: | ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr3_agpr4 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr1_agpr2 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: a2_to_a2_implicit_defs @@ -996,8 +996,8 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit 
killed $agpr1_agpr2, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: a2_to_a2_implicit_defs ; GFX942: liveins: $agpr0_agpr1 @@ -1006,8 +1006,8 @@ body: | ; GFX942-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX942-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr1_agpr2 $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 @@ -1024,28 +1024,28 @@ body: | ; GFX908-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX908: liveins: $agpr4_agpr5_agpr6 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX90A: liveins: $agpr4_agpr5_agpr6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; ; GFX942-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX942: liveins: $agpr4_agpr5_agpr6 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr4_agpr5_agpr6 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -1060,11 +1060,11 @@ body: | ; GFX908-LABEL: name: a3_to_a3_overlap_kill ; GFX908: liveins: 
$agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 @@ -1072,18 +1072,18 @@ body: | ; GFX90A-LABEL: name: a3_to_a3_overlap_kill ; GFX90A: liveins: $agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 ; ; GFX942-LABEL: name: a3_to_a3_overlap_kill ; GFX942: liveins: $agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr0 = 
V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3 @@ -1098,30 +1098,30 @@ body: | bb.0: ; GFX908-LABEL: name: a4_to_a4 ; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: 
S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-LABEL: name: a4_to_a4 ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; ; GFX942-LABEL: name: a4_to_a4 ; GFX942: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -1137,32 +1137,32 @@ body: | ; GFX908-LABEL: 
name: a4_to_a4_overlap ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; ; GFX90A-LABEL: name: a4_to_a4_overlap ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $agpr5 = 
V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; ; GFX942-LABEL: name: a4_to_a4_overlap ; GFX942: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX942-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX942-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 @@ -1175,46 +1175,46 @@ body: | bb.0: ; GFX908-LABEL: name: a8_to_a8 ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: 
$vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec + ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-LABEL: name: a8_to_a8 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr9 
= V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX942-LABEL: name: a8_to_a8 ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX942-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX942-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX942-NEXT: $agpr14 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX942-NEXT: $agpr13 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX942-NEXT: $agpr12 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr11 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr10 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX942-NEXT: $agpr8 = 
V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec @@ -1229,78 +1229,78 @@ body: | ; GFX908-LABEL: name: a16_to_a16 ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec + ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec ; GFX908-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec ; GFX908-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: 
$vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec ; GFX908-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec ; GFX908-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec ; GFX908-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec ; GFX908-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec ; GFX908-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec ; GFX908-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec ; GFX908-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec ; GFX908-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec ; GFX908-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec ; GFX908-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; 
GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX90A-LABEL: name: a16_to_a16 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 killed $agpr15, implicit $exec + ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 killed $agpr14, implicit $exec + ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 killed $agpr13, implicit $exec + ; GFX90A-NEXT: 
$agpr28 = V_ACCVGPR_MOV_B32 killed $agpr12, implicit $exec + ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 killed $agpr11, implicit $exec + ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 killed $agpr10, implicit $exec + ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 killed $agpr9, implicit $exec + ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 killed $agpr8, implicit $exec + ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX942-LABEL: name: a16_to_a16 ; GFX942: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX942-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr28 = 
V_ACCVGPR_MOV_B32 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr17 = 
V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX942-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX942-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 killed $agpr15, implicit $exec + ; GFX942-NEXT: $agpr30 = V_ACCVGPR_MOV_B32 killed $agpr14, implicit $exec + ; GFX942-NEXT: $agpr29 = V_ACCVGPR_MOV_B32 killed $agpr13, implicit $exec + ; GFX942-NEXT: $agpr28 = V_ACCVGPR_MOV_B32 killed $agpr12, implicit $exec + ; GFX942-NEXT: $agpr27 = V_ACCVGPR_MOV_B32 killed $agpr11, implicit $exec + ; GFX942-NEXT: $agpr26 = V_ACCVGPR_MOV_B32 killed $agpr10, implicit $exec + ; GFX942-NEXT: $agpr25 = V_ACCVGPR_MOV_B32 killed $agpr9, implicit $exec + ; GFX942-NEXT: $agpr24 = V_ACCVGPR_MOV_B32 killed $agpr8, implicit $exec + ; GFX942-NEXT: $agpr23 = V_ACCVGPR_MOV_B32 killed $agpr7, implicit $exec + ; GFX942-NEXT: $agpr22 = V_ACCVGPR_MOV_B32 killed $agpr6, implicit $exec + ; GFX942-NEXT: $agpr21 = V_ACCVGPR_MOV_B32 killed $agpr5, implicit $exec + ; GFX942-NEXT: $agpr20 = V_ACCVGPR_MOV_B32 killed $agpr4, implicit $exec + ; GFX942-NEXT: $agpr19 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr18 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec + ; GFX942-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec + ; GFX942-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 killed $agpr0, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = COPY killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec @@ -1353,37 +1353,29 @@ body: | ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr4 = 
V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, 240, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ... 
--- @@ -1397,37 +1389,29 @@ body: | ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: 
$agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX942: liveins: $agpr0, $sgpr2_sgpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, 240, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
@@ -1442,37 +1426,29 @@ body: | ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit 
$exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple ; GFX942: liveins: $agpr0, $agpr2_agpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, 240, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ... 
@@ -1487,37 +1463,29 @@ body: | ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, 
implicit $exec + ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; ; GFX942-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX942: liveins: $agpr0, $agpr2_agpr3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX942-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $agpr7 = V_ACCVGPR_MOV_B32 killed $agpr3, implicit $exec + ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, 240, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir index 9376a4c59c170..644c8641c606a 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir @@ -14,11 +14,10 @@ body: | ; CHECK-LABEL: name: copy_has_implicit_kill_superreg ; CHECK: renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit-def $vgpr7_vgpr8, implicit $vgpr10_vgpr11 - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr10_vgpr11, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7 renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF - renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 + renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, 3, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 S_ENDPGM 0, implicit $vgpr7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir index cc976fe13c47c..99a3daa2d05fc 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -15,13 +15,13 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64 ; GFX908: liveins: $vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64 ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64 ; GFX942: liveins: $vgpr2_vgpr3 @@ -31,8 +31,9 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64 ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64 ; GFX1250: liveins: $vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} @@ -49,13 +50,13 @@ body: | ; GFX908-LABEL: name: copy_s64_to_v64 ; GFX908: liveins: $sgpr2_sgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def 
$vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s64_to_v64 ; GFX90A: liveins: $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $sgpr2_sgpr3, 12, killed $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s64_to_v64 ; GFX942: liveins: $sgpr2_sgpr3 @@ -65,8 +66,9 @@ body: | ; GFX10-LABEL: name: copy_s64_to_v64 ; GFX10: liveins: $sgpr2_sgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s64_to_v64 ; GFX1250: liveins: $sgpr2_sgpr3 ; GFX1250-NEXT: {{ $}} @@ -83,31 +85,32 @@ body: | ; GFX908-LABEL: name: copy_a64_to_v64 ; GFX908: liveins: $agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_a64_to_v64 ; GFX90A: liveins: $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: 
$vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_a64_to_v64 ; GFX942: liveins: $agpr2_agpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_a64_to_v64 ; GFX10: liveins: $agpr2_agpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX10-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_a64_to_v64 ; GFX1250: liveins: $agpr2_agpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX1250-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec ... 
@@ -120,35 +123,36 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_fwd ; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_fwd ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_fwd ; GFX942: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, 
implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_fwd ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_fwd ; GFX1250: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ... 
@@ -161,35 +165,36 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_back ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_back ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_back ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, 
implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_back ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_back ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $exec $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ... 
@@ -202,36 +207,37 @@ body: | ; GFX908-LABEL: name: copy_v96_to_v96 ; GFX908: liveins: $vgpr4_vgpr5_vgpr6 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_to_v96 ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_to_v96 ; GFX942: liveins: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit 
$exec ; ; GFX10-LABEL: name: copy_v96_to_v96 ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_to_v96 ; GFX1250: liveins: $vgpr4_vgpr5_vgpr6 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec ... 
@@ -244,13 +250,12 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX908: liveins: $vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX90A: liveins: $vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX942: liveins: $vgpr3 @@ -260,13 +265,13 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX10: liveins: $vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX1250: liveins: $vgpr3 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, 12, implicit $exec ... 
--- @@ -278,13 +283,12 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX908: liveins: $vgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX90A: liveins: $vgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX942: liveins: $vgpr2 @@ -294,13 +298,13 @@ body: | ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX10: liveins: $vgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr2, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX1250: liveins: $vgpr2 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, 3, implicit $exec ... 
--- @@ -312,35 +316,36 @@ body: | ; GFX908-LABEL: name: copy_s128_to_v128_killed ; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; ; GFX90A-LABEL: name: copy_s128_to_v128_killed ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $sgpr4_sgpr5, 12, killed $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, killed $sgpr6_sgpr7, 12, killed $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec ; ; GFX942-LABEL: name: copy_s128_to_v128_killed ; GFX942: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed 
$sgpr4_sgpr5, implicit $exec + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr6_sgpr7, implicit $exec ; ; GFX10-LABEL: name: copy_s128_to_v128_killed ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; ; GFX1250-LABEL: name: copy_s128_to_v128_killed ; GFX1250: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr4_sgpr5, implicit $exec + ; GFX1250-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr6_sgpr7, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -353,31 +358,32 @@ body: | ; GFX908-LABEL: name: copy_v64_to_v64_unaligned ; GFX908: liveins: $vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_to_v64_unaligned ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_to_v64_unaligned ; GFX942: liveins: $vgpr2_vgpr3 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_to_v64_unaligned ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_to_v64_unaligned ; GFX1250: liveins: $vgpr2_vgpr3 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 
= V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -390,31 +396,32 @@ body: | ; GFX908-LABEL: name: copy_v64_unaligned_to_v64 ; GFX908: liveins: $vgpr3_vgpr4 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v64_unaligned_to_v64 ; GFX90A: liveins: $vgpr3_vgpr4 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v64_unaligned_to_v64 ; GFX942: liveins: $vgpr3_vgpr4 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 ; GFX10: liveins: $vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = 
V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v64_unaligned_to_v64 ; GFX1250: liveins: $vgpr3_vgpr4 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec ... @@ -427,41 +434,42 @@ body: | ; GFX908-LABEL: name: copy_v128_to_v128_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_to_v128_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, 
implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_to_v128_unaligned ; GFX942: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_to_v128_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, 
implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_to_v128_unaligned ; GFX1250: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ... 
@@ -474,41 +482,42 @@ body: | ; GFX908-LABEL: name: copy_v128_unaligned_to_v128 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v128_unaligned_to_v128 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v128_unaligned_to_v128 ; GFX942: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, 
implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v128_unaligned_to_v128 ; GFX1250: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: 
$vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -521,31 +530,32 @@ body: | ; GFX908-LABEL: name: copy_s64_to_v64_unaligned ; GFX908: liveins: $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s64_to_v64_unaligned ; GFX90A: liveins: $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s64_to_v64_unaligned ; GFX942: liveins: $sgpr8_sgpr9 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: 
copy_s64_to_v64_unaligned ; GFX10: liveins: $sgpr8_sgpr9 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s64_to_v64_unaligned ; GFX1250: liveins: $sgpr8_sgpr9 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec ... 
@@ -558,41 +568,42 @@ body: | ; GFX908-LABEL: name: copy_s128_to_v128_unaligned ; GFX908: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s128_to_v128_unaligned ; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s128_to_v128_unaligned ; GFX942: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def 
$vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_s128_to_v128_unaligned ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX10-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s128_to_v128_unaligned ; GFX1250: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit 
$sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ... @@ -605,36 +616,37 @@ body: | ; GFX908-LABEL: name: copy_v96_to_v96_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_to_v96_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_to_v96_unaligned ; GFX942: liveins: 
$vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v96_to_v96_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_to_v96_unaligned ; GFX1250: liveins: $vgpr8_vgpr9_vgpr10 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed 
$vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -647,36 +659,37 @@ body: | ; GFX908-LABEL: name: copy_v96_unaligned_to_v96 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_v96_unaligned_to_v96 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_v96_unaligned_to_v96 ; GFX942: liveins: $vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; 
GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_v96_unaligned_to_v96 ; GFX1250: liveins: $vgpr7_vgpr8_vgpr9 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr7, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr8, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec ... 
@@ -689,36 +702,37 @@ body: | ; GFX908-LABEL: name: copy_s96_to_v96 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s96_to_v96 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s96_to_v96 ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit 
$exec ; ; GFX10-LABEL: name: copy_s96_to_v96 ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s96_to_v96 ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... 
@@ -731,35 +745,36 @@ body: | ; GFX908-LABEL: name: copy_s96_to_v96_unaligned ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; ; GFX90A-LABEL: name: copy_s96_to_v96_unaligned ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; ; GFX942-LABEL: name: copy_s96_to_v96_unaligned ; GFX942: liveins: $sgpr0_sgpr1_sgpr2 ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed 
$sgpr0, implicit $exec, implicit $exec ; ; GFX10-LABEL: name: copy_s96_to_v96_unaligned ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; ; GFX1250-LABEL: name: copy_s96_to_v96_unaligned ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2 ; GFX1250-NEXT: {{ $}} - ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... From ef8e4f7fd520393dd6db35f4d4a2992b1213d89d Mon Sep 17 00:00:00 2001 From: vikashgu Date: Mon, 28 Jul 2025 10:07:52 +0000 Subject: [PATCH 3/4] [LIT] Updated the regressing LIT tests to accommodate patch changes.
--- llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 6 +- .../atomic_optimizations_mul_one.ll | 12 +- .../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 25 +- .../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 25 +- .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 158 +- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 228 +- .../AMDGPU/GlobalISel/bitcast_38_i16.ll | 6 +- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 24 +- ...vergence-divergent-i1-used-outside-loop.ll | 6 +- .../GlobalISel/divergence-structurizer.ll | 30 +- .../divergence-temporal-divergent-i1.ll | 2 +- .../AMDGPU/GlobalISel/extractelement.i128.ll | 10 +- .../AMDGPU/GlobalISel/extractelement.i16.ll | 20 +- .../AMDGPU/GlobalISel/extractelement.i8.ll | 42 +- .../AMDGPU/GlobalISel/extractelement.ll | 10 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll | 4 +- .../GlobalISel/flat-scratch-init.gfx.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll | 4 +- ...licit-kernarg-backend-usage-global-isel.ll | 37 +- .../AMDGPU/GlobalISel/insertelement.i16.ll | 155 +- .../AMDGPU/GlobalISel/insertelement.i8.ll | 72 +- .../AMDGPU/GlobalISel/insertelement.ll | 1044 ++++---- .../GlobalISel/inst-select-copy-scc-vcc.ll | 2 +- .../AMDGPU/GlobalISel/lds-global-value.ll | 2 +- .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 40 +- .../GlobalISel/llvm.amdgcn.div.scale.ll | 12 +- .../GlobalISel/llvm.amdgcn.image.load.2d.ll | 46 +- .../llvm.amdgcn.image.load.2darraymsaa.a16.ll | 12 +- .../llvm.amdgcn.image.load.2darraymsaa.ll | 50 +- .../llvm.amdgcn.image.load.3d.a16.ll | 60 +- .../GlobalISel/llvm.amdgcn.image.load.3d.ll | 16 +- .../GlobalISel/llvm.amdgcn.image.store.2d.ll | 4 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 82 +- .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 14 +- .../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 7 +- .../GlobalISel/llvm.amdgcn.update.dpp.ll | 4 +- .../AMDGPU/GlobalISel/load-unaligned.ll | 10 +- .../AMDGPU/GlobalISel/load-uniform-in-vgpr.ll | 303 ++- 
.../CodeGen/AMDGPU/GlobalISel/mubuf-global.ll | 22 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 2 +- llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll | 17 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll | 8 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 86 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 43 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 8 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 86 +- llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 6 +- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 35 +- .../GlobalISel/widen-i8-i16-scalar-loads.ll | 4 +- .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 48 +- .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 51 +- .../CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 4 +- .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 316 +-- .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 115 +- .../abi-attribute-hints-undefined-behavior.ll | 4 +- llvm/test/CodeGen/AMDGPU/add.ll | 38 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 12 +- .../CodeGen/AMDGPU/agpr-copy-no-vgprs.mir | 10 +- .../CodeGen/AMDGPU/agpr-copy-reuse-writes.mir | 22 +- .../AMDGPU/agpr-copy-sgpr-no-vgprs.mir | 10 +- llvm/test/CodeGen/AMDGPU/agpr-csr.ll | 48 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 2 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 28 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 1 - .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll | 4 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 4 +- .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 40 +- .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 20 +- llvm/test/CodeGen/AMDGPU/and.ll | 12 +- .../CodeGen/AMDGPU/any_extend_vector_inreg.ll | 56 +- .../atomic_optimizations_global_pointer.ll | 382 +-- .../atomic_optimizations_local_pointer.ll | 102 +- .../CodeGen/AMDGPU/atomicrmw_usub_cond.ll | 4 +- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 6 +- .../AMDGPU/av-split-dead-valno-crash.ll | 4 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 16 +- 
...der-no-live-segment-at-def-implicit-def.ll | 6 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 337 +-- .../buffer-fat-pointer-atomicrmw-fmax.ll | 274 +-- .../buffer-fat-pointer-atomicrmw-fmin.ll | 274 +-- .../buffer-fat-pointer-atomicrmw-usub_cond.ll | 20 +- .../CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll | 4 +- llvm/test/CodeGen/AMDGPU/build_vector.ll | 2 +- .../CodeGen/AMDGPU/call-argument-types.ll | 644 ++--- .../CodeGen/AMDGPU/calling-conventions.ll | 12 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 22 +- .../codegen-prepare-addrspacecast-non-null.ll | 3 +- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 8 +- .../CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir | 12 +- .../CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir | 26 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 2 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 52 +- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 6 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 2 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 68 +- llvm/test/CodeGen/AMDGPU/dag-divergence.ll | 2 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 69 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 28 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 4 +- ...cannot-create-empty-or-backward-segment.ll | 2 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 26 +- .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 14 +- llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 4 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 2 +- .../fast-unaligned-load-store.global.ll | 18 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 38 +- llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 26 +- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 20 +- .../CodeGen/AMDGPU/fence-lds-read2-write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 4 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 50 +- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 96 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 428 ++-- 
.../AMDGPU/flat_atomics_i64_noprivate.ll | 772 +++--- .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 121 +- .../flat_atomics_i64_system_noprivate.ll | 1807 ++++++++------ llvm/test/CodeGen/AMDGPU/fmaximum.ll | 2 +- llvm/test/CodeGen/AMDGPU/fmed3.ll | 2 +- llvm/test/CodeGen/AMDGPU/fminimum.ll | 2 +- .../AMDGPU/fmul-2-combine-multi-use.ll | 55 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 6 +- llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 8 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 6 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 8 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 4 +- .../CodeGen/AMDGPU/fneg-modifier-casting.ll | 2 +- llvm/test/CodeGen/AMDGPU/fneg.bf16.ll | 9 +- llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 2 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 4 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 8 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 28 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 12 +- llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll | 2 +- llvm/test/CodeGen/AMDGPU/frem.ll | 1378 +---------- llvm/test/CodeGen/AMDGPU/fshl.ll | 9 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 436 +--- .../AMDGPU/gfx-callable-return-types.ll | 4 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 76 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 61 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 61 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 65 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 4 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 32 +- .../AMDGPU/global_atomics_i32_system.ll | 44 +- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 112 +- .../AMDGPU/global_atomics_i64_system.ll | 123 +- .../AMDGPU/global_atomics_scan_fadd.ll | 140 +- .../AMDGPU/global_atomics_scan_fmax.ll | 72 +- .../AMDGPU/global_atomics_scan_fmin.ll | 72 +- .../AMDGPU/global_atomics_scan_fsub.ll | 140 +- .../greedy-alloc-fail-sgpr1024-spill.mir | 4 +- llvm/test/CodeGen/AMDGPU/half.ll | 244 +- 
.../identical-subrange-spill-infloop.ll | 5 +- .../AMDGPU/implicit-kernarg-backend-usage.ll | 8 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 166 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 88 +- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 8 +- .../issue130120-eliminate-frame-index.ll | 2 +- llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll | 4 +- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 47 +- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 105 +- .../AMDGPU/kernel-argument-dag-lowering.ll | 10 +- .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 16 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 19 +- .../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 128 +- .../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 102 +- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll | 2 +- .../AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll | 198 +- .../AMDGPU/llvm.amdgcn.image.msaa.load.ll | 47 +- .../llvm.amdgcn.image.sample.d16.dim.ll | 10 +- .../AMDGPU/llvm.amdgcn.image.sample.dim.ll | 20 +- .../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 4 +- .../AMDGPU/llvm.amdgcn.intersect_ray.ll | 103 +- .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 20 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 120 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 269 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 2169 +++++++++-------- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 84 +- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 325 +-- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 660 ++--- .../llvm.amdgcn.pops.exiting.wave.id.ll | 68 +- .../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll | 2 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 44 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 79 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll | 2 +- 
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 12 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 1233 ++-------- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 120 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 48 +- .../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 142 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 397 +-- .../llvm.amdgcn.wmma.imm.gfx1250.w32.ll | 276 +-- .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 44 +- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 20 +- .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 2 +- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 12 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 22 +- llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 498 ++-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 611 ++--- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 319 +-- llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 75 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 631 ++--- llvm/test/CodeGen/AMDGPU/load-global-f32.ll | 58 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 466 ++-- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 324 +-- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 437 ++-- llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 91 +- .../lower-work-group-id-intrinsics-hsa.ll | 20 +- .../AMDGPU/lower-work-group-id-intrinsics.ll | 4 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 8 +- .../CodeGen/AMDGPU/max-hard-clause-length.ll | 8 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 2 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 16 +- 
llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 45 +- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 6 +- llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 146 +- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 794 +----- .../AMDGPU/mfma-no-register-aliasing.ll | 2 +- llvm/test/CodeGen/AMDGPU/min.ll | 36 +- .../AMDGPU/module-lds-false-sharing.ll | 14 +- .../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 4 +- llvm/test/CodeGen/AMDGPU/or.ll | 16 +- .../AMDGPU/pal-simple-indirect-call.ll | 2 +- llvm/test/CodeGen/AMDGPU/permute.ll | 2 +- .../AMDGPU/promote-constOffset-to-imm.ll | 20 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 8 +- .../ran-out-of-sgprs-allocation-failure.mir | 4 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 6 +- .../remaining-virtual-register-operands.mir | 1 + .../AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 9 +- .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 4 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 5 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 226 +- llvm/test/CodeGen/AMDGPU/sad.ll | 13 +- llvm/test/CodeGen/AMDGPU/saddo.ll | 8 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 17 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 8 +- llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir | 224 +- .../sgpr-spill-update-only-slot-indexes.ll | 6 +- .../CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 4 +- .../test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 14 +- .../AMDGPU/shufflevector.v2i64.v8i64.ll | 40 +- .../CodeGen/AMDGPU/simple-indirect-call.ll | 8 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll | 2 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 2 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 12 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 132 +- llvm/test/CodeGen/AMDGPU/srem.ll | 19 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 13 +- llvm/test/CodeGen/AMDGPU/ssubo.ll | 8 +- ...tack-pointer-offset-relative-frameindex.ll | 4 +- .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 12 +- llvm/test/CodeGen/AMDGPU/store-local.128.ll | 19 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 
4 +- llvm/test/CodeGen/AMDGPU/structurize-hoist.ll | 2 +- llvm/test/CodeGen/AMDGPU/sub.ll | 13 +- .../AMDGPU/subreg-coalescer-undef-use.ll | 9 +- llvm/test/CodeGen/AMDGPU/swdev380865.ll | 9 +- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 4 +- llvm/test/CodeGen/AMDGPU/trunc-store.ll | 4 +- llvm/test/CodeGen/AMDGPU/trunc.ll | 2 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 4 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 19 +- llvm/test/CodeGen/AMDGPU/udivrem.ll | 2 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 28 +- .../umin-sub-to-usubo-select-combine.ll | 2 +- .../AMDGPU/undef-handling-crash-in-ra.ll | 3 +- .../AMDGPU/undefined-subreg-liverange.ll | 3 - .../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 36 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 9 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 6 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 4 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 4 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 5 +- .../CodeGen/AMDGPU/whole-wave-functions.ll | 5 +- llvm/test/CodeGen/AMDGPU/wqm.ll | 2 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 16 +- llvm/test/CodeGen/AMDGPU/xor.ll | 14 +- 301 files changed, 11638 insertions(+), 14323 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll index d6f1b142b36e0..6060e9366cad0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll @@ -438,9 +438,9 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s16, s18 ; GFX7-NEXT: s_addc_u32 s5, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -455,8 +455,8 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s4, s16, s18 ; GFX9-NEXT: s_addc_u32 s5, s17, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -469,8 +469,8 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s4, s16, s18 ; GFX8-NEXT: s_addc_u32 s5, s17, s19 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll index bbee88050edb9..54f9ead913b02 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -676,8 +676,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_saddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s4, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_addc_u32 s5, s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -693,8 +693,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_saddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -710,8 +710,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll index 65bc2d73b36b6..fb93eff1bec45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -96,8 +96,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -192,8 +192,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -294,8 +294,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) { ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -392,8 +392,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt 
expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -492,8 +492,8 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -598,8 +598,8 @@ define amdgpu_cs void @atomic_ptr_xor_and_format(ptr addrspace(8) inreg %arg) { ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 8063b29c29985..841de28cd4f82 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -8,6 +8,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + ; TODO: Delete this and add run lines to use *-atomicrmw-fmax.ll tests define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { @@ -1823,10 +1824,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1865,10 +1864,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1989,10 +1986,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,10 +2026,10 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,9 +2076,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc @@ -2108,9 +2105,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 5b0b602bd99ba..685eaabeb20b0 
100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -8,6 +8,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + ; TODO: Delete this and add run lines to use *-atomicrmw-fmin.ll tests define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { @@ -1823,10 +1824,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1865,10 +1864,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1989,10 +1986,10 @@ define void 
@buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2029,10 +2026,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2079,9 +2076,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: 
v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc @@ -2108,9 +2105,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index dac726df5decb..9fabaab9bca44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -25,9 +25,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -43,9 +43,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -108,9 +108,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s2 ; 
CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -126,9 +126,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1109,9 +1109,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1375,9 +1375,9 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1461,9 +1461,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1579,10 +1579,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1689,11 +1689,11 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -1719,16 +1719,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -1743,16 +1743,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc 
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1829,15 +1829,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -1855,15 +1855,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -1940,9 +1940,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 
; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1956,9 +1956,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2003,9 +2003,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2027,8 +2027,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2045,8 +2045,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2080,8 +2080,8 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2093,9 +2093,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2118,8 +2118,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2136,8 +2136,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2171,8 +2171,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2184,9 +2184,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2218,8 +2218,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 @@ -2247,8 +2247,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 @@ -2265,16 +2265,16 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2301,8 +2301,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2313,10 +2313,11 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2324,7 +2325,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo @@ -2390,8 +2391,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc @@ -2429,10 +2430,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -2460,10 +2462,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 -; CI-NEXT: v_mov_b32_e32 v0, s2 ; 
CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2481,10 +2483,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2559,9 +2561,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2578,9 +2580,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2647,9 +2649,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: 
v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2666,9 +2668,9 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2856,9 +2858,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2875,9 +2877,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2942,8 +2944,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: 
flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2963,8 +2965,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3030,8 +3032,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3051,8 +3053,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3115,9 +3117,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3131,9 +3133,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 
8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3191,8 +3193,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3209,8 +3211,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3269,8 +3271,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3287,8 +3289,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3356,8 +3358,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], 
v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3382,8 +3384,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3539,10 +3541,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 ; CI-NEXT: v_mov_b32_e32 v4, s1 @@ -3561,10 +3563,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 7f3e24f97b6e2..1b93d7c265904 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -26,9 +26,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -44,9 +44,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -121,9 +121,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -139,9 +139,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1082,10 +1082,10 @@ define amdgpu_kernel void 
@atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 -; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1103,10 +1103,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1198,9 +1198,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1217,9 +1217,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ 
-1299,9 +1299,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1318,9 +1318,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1543,9 +1543,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1562,9 +1562,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1641,8 +1641,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1662,8 +1662,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1741,8 +1741,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1762,8 +1762,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1840,9 +1840,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt 
vmcnt(0) @@ -1856,9 +1856,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1927,8 +1927,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1945,8 +1945,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2016,8 +2016,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2034,8 +2034,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 
v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2116,8 +2116,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2142,8 +2142,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -2502,9 +2502,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2516,9 +2516,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 
v0, s2 ; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2614,9 +2614,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2628,9 +2628,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS @@ -2807,9 +2807,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2820,9 +2820,9 @@ define amdgpu_kernel 
void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2904,9 +2904,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2917,9 +2917,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 42 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS @@ -3035,10 +3035,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3057,10 +3057,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3167,11 +3167,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3186,11 +3186,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -3218,10 +3218,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v4, s3 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 ; CI-NEXT: v_mov_b32_e32 v4, s1 @@ -3240,10 +3240,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: v_mov_b32_e32 v4, s1 @@ -3336,16 +3336,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3360,16 +3360,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3460,15 +3460,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; 
CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3486,15 +3486,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3588,15 +3588,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 @@ -3614,15 +3614,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: 
v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -3715,9 +3715,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3731,9 +3731,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3778,9 +3778,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3792,9 +3792,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3814,8 +3814,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3832,8 +3832,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3867,8 +3867,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3880,9 +3880,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3894,9 +3894,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3917,8 +3917,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3935,8 +3935,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3970,8 +3970,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3983,9 +3983,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 ; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3997,9 +3997,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS @@ -4031,8 +4031,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2 @@ -4060,8 +4060,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2 @@ -4078,16 +4078,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4114,8 +4114,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4126,10 +4126,11 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4137,7 +4138,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo @@ -4149,17 +4150,18 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: 
v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4226,8 +4228,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc @@ -4265,10 +4267,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 42 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4284,10 +4287,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_mov_b32_e32 v2, 42 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -4316,9 +4320,9 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -4339,9 +4343,9 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll index 62a5313dc8d3c..af29a2f7ba6ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll @@ -8,11 +8,10 @@ define void @main(<19 x i32> %arg) { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s12, s4 ; GCN-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s12, s4 ; GCN-NEXT: s_mov_b32 s13, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_mov_b32 s6, s4 ; GCN-NEXT: s_mov_b32 s7, s4 @@ -23,6 
+22,7 @@ define void @main(<19 x i32> %arg) { ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: image_store v[0:3], v[4:5], s[4:11] unorm ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -51,7 +51,7 @@ define void @main(<19 x i32> %arg) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 94b956ef254a5..e437877956a93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -476,10 +476,10 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -696,6 +696,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -706,7 +707,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, 
v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 @@ -884,9 +884,9 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: flat_load_ubyte v4, v[8:9] ; VI-NEXT: flat_load_ubyte v5, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[12:13] -; VI-NEXT: v_mov_b32_e32 v8, s1 ; VI-NEXT: v_mov_b32_e32 v7, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v8, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v10, s1 ; VI-NEXT: v_mov_b32_e32 v9, s0 @@ -957,9 +957,9 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s1 ; VI-NEXT: v_mov_b32_e32 v10, s0 @@ -1013,11 +1013,11 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1059,10 +1059,10 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, 
vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1104,10 +1104,10 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1174,6 +1174,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1184,7 +1185,6 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 @@ -1229,10 +1229,10 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1273,10 +1273,10 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1318,10 +1318,10 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1362,10 +1362,10 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 121dd309fddf9..cb2edcfba4ee5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -557,14 +557,14 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s7, exec_lo, s8 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -573,8 +573,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_cbranch_execz .LBB7_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index 5c57d355959ef..fda8d4187d42a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -123,8 +123,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; 
GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v2, v4 @@ -136,8 +136,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: s_cbranch_execz .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -145,8 +145,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s5, exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s5, s3, s0 ; GFX10-NEXT: global_load_dword v6, v[4:5], off +; GFX10-NEXT: s_or_b32 s5, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v6 @@ -203,8 +203,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 @@ -216,8 +216,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; 
GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo @@ -228,8 +228,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -237,8 +237,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s6, exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s6, s3, s0 ; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_or_b32 s6, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 @@ -307,8 +307,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_and_b32 s5, exec_lo, exec_lo ; GFX10-NEXT: s_or_b32 s5, s1, s5 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8 @@ -320,8 +320,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v9, vcc_lo @@ -332,8 +332,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr 
addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_2 ; GFX10-NEXT: ; %bb.6: ; %C ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v6, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v7, v9, vcc_lo @@ -344,8 +344,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: s_cbranch_execz .LBB4_1 ; GFX10-NEXT: ; %bb.7: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -353,8 +353,8 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: s_andn2_b32 s3, s8, exec_lo ; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s8, s3, s0 ; GFX10-NEXT: global_load_dword v10, v[8:9], off +; GFX10-NEXT: s_or_b32 s8, s3, s0 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v10 @@ -427,14 +427,14 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_mov_b32 s8, exec_lo ; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_and_b32 s7, exec_lo, s8 ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo -; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo +; GFX10-NEXT: s_or_b32 s7, s1, s7 ; GFX10-NEXT: s_or_b32 s6, s6, s8 ; GFX10-NEXT: 
global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -443,8 +443,8 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: s_add_i32 s2, s0, 1 ; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64 ; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index a8b27ecd7e9fc..eab7a43c32c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -414,8 +414,8 @@ define void @nested_loops_temporal_divergence_both(float %pre.cond.val, i32 %n.i ; GFX10-NEXT: ; %bb.3: ; %UseInst ; GFX10-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10 -; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_mov_b32_e32 v8, s6 +; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s9 ; GFX10-NEXT: s_add_i32 s6, s6, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 9dfd0a47d1e1e..bc47a8bc1bec7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -41,10 +41,10 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre ; GFX7-NEXT: s_and_b32 s2, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s2, 4 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -89,10 +89,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -106,10 +106,10 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 798f6eb65e6aa..f87a0385616d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -35,10 +35,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg ; GFX7-NEXT: s_and_b32 s2, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s2, 1 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) @@ -74,10 +74,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ushort v0, v[0:1], off @@ -88,10 +88,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] @@ -679,10 +679,10 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg ; GFX7-NEXT: s_and_b32 s2, s4, 7 ; GFX7-NEXT: s_lshl_b32 s4, s2, 1 ; GFX7-NEXT: s_mov_b32 s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -718,10 +718,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 7 -; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ushort v0, v[0:1], off @@ -732,10 +732,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, ; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 7 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index de1079196223a..44ed74fd072b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -35,11 +35,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p ; GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -79,8 +79,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 3 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -92,8 +92,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 
; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -116,8 +116,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -130,7 +130,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX11-NEXT: s_and_b32 s0, s2, 3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -262,8 +262,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p ; ; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 3, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -688,11 +688,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p ; 
GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 7 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -732,8 +732,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 7 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -745,8 +745,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 7 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -769,8 +769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 7 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -783,7 +783,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3 ; GFX11-NEXT: s_and_b32 s0, s2, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: v_dual_mov_b32 
v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -915,8 +915,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p ; ; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 7, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 7, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -1725,11 +1725,11 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_and_b32 s4, s4, 15 ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1769,8 +1769,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off @@ -1782,8 +1782,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s2, 15 ; GFX8-NEXT: s_ashr_i32 s1, s0, 31 -; GFX8-NEXT: v_mov_b32_e32 v3, 
s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] @@ -1806,8 +1806,8 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_ashr_i32 s1, s0, 31 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off @@ -1820,7 +1820,7 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i ; GFX11-NEXT: s_and_b32 s0, s2, 15 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_ashr_i32 s1, s0, 31 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -1952,8 +1952,8 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg % ; ; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_and_b32 v2, 15, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 206011adf0213..1f1603de6ed26 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3183,10 +3183,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; MOVREL-NEXT: v_mov_b32_e32 v0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm @@ -4189,8 +4189,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b32 s3, 0x40400000, s3 ; MOVREL-NEXT: s_cmp_eq_u32 s2, 3 ; MOVREL-NEXT: s_cselect_b32 s2, 4.0, s3 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: flat_store_dword v[0:1], v2 ; MOVREL-NEXT: s_endpgm @@ -4541,10 +4541,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s6, 3 ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] -; MOVREL-NEXT: v_mov_b32_e32 v0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll index 39a793ce67bb9..79610c43fdbca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll @@ -254,7 +254,7 @@ define amdgpu_ps void 
@s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -263,7 +263,7 @@ define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX12-NEXT: s_bitset0_b32 s0, 31 ; GFX12-NEXT: s_bitset0_b32 s1, 31 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll index 3d224f2f6bf05..9429576474825 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=MESA %s ; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=PAL %s + ; Test that the initialization for flat_scratch doesn't crash. PAL ; doesn't add a user SGPR for initializing flat_scratch, mesa does ; (although this probably isn't actually defined). 
@@ -10,11 +11,11 @@ define amdgpu_ps void @amdgpu_ps() { ; MESA-LABEL: amdgpu_ps: ; MESA: ; %bb.0: ; MESA-NEXT: s_mov_b64 s[0:1], src_private_base -; MESA-NEXT: s_mov_b32 s0, 0 ; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4 -; MESA-NEXT: v_mov_b32_e32 v0, s0 +; MESA-NEXT: s_mov_b32 s0, 0 ; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v0, s0 ; MESA-NEXT: v_mov_b32_e32 v1, s1 ; MESA-NEXT: flat_store_dword v[0:1], v2 ; MESA-NEXT: s_waitcnt vmcnt(0) @@ -31,8 +32,8 @@ define amdgpu_ps void @amdgpu_ps() { ; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; PAL-NEXT: s_mov_b64 s[0:1], src_private_base ; PAL-NEXT: s_mov_b32 s0, 0 -; PAL-NEXT: v_mov_b32_e32 v0, s0 ; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; PAL-NEXT: v_mov_b32_e32 v0, s0 ; PAL-NEXT: v_mov_b32_e32 v1, s1 ; PAL-NEXT: flat_store_dword v[0:1], v2 ; PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll index ebc28cb005538..7e1c405c0d6d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll @@ -254,7 +254,7 @@ define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -263,7 +263,7 @@ define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out ; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 ; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; 
GFX12-NEXT: s_endpgm %fneg = fneg <2 x float> %in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 91ee7642790fc..5a527c61df424 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -2,11 +2,12 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s - ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s + + define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: @@ -15,16 +16,16 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8V4-NEXT: s_add_u32 s2, s6, 0x44 ; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8V4-NEXT: s_and_b32 s4, 1, s2 -; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
GFX8V4-NEXT: s_add_u32 s2, s6, 64 -; GFX8V4-NEXT: flat_load_dword v3, v[0:1] +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: s_addc_u32 s3, s7, 0 +; GFX8V4-NEXT: flat_load_dword v3, v[0:1] ; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX8V4-NEXT: flat_load_dword v4, v[0:1] @@ -59,17 +60,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX8V5-NEXT: s_mov_b32 s2, s1 +; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) -; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -85,17 +86,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s0, s4 ; GFX9V4-NEXT: s_cmp_lg_u32 s4, -1 -; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V4-NEXT: s_mov_b32 s2, s5 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V4-NEXT: s_cmp_lg_u32 s5, -1 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -111,17 +112,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s0, s4 ; GFX9V5-NEXT: s_cmp_lg_u32 s4, -1 -; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V5-NEXT: s_mov_b32 s2, s5 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 ; GFX9V5-NEXT: s_cmp_lg_u32 s5, -1 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) @@ -295,11 +296,11 @@ define amdgpu_kernel void @llvm_debugtrap() #0 { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V4-NEXT: s_add_u32 s0, s8, 8 -; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 @@ -321,11 +322,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V5-NEXT: s_add_u32 s0, s8, 8 -; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; 
GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 0e1bbbd1ea92b..8b9dca591d5fb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -1128,9 +1128,9 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1150,11 +1150,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1263,8 +1263,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, 
v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 @@ -1288,8 +1288,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -1768,8 +1768,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -1801,14 +1801,14 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -1836,15 +1836,16 @@ define amdgpu_ps void 
@insertelement_s_v8i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s0, s4, s0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_cselect_b32 s1, s4, s1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_cselect_b32 s2, s4, s2 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_cselect_b32 s3, s4, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2315,13 +2316,13 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2338,27 +2339,29 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX11-NEXT: s_and_b32 s1, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_not_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 @@ -2497,13 +2500,13 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX10-NEXT: v_not_b32_e32 v5, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, 
s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2530,14 +2533,14 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX11-NEXT: v_not_b32_e32 v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3053,17 +3056,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: s_cselect_b32 s5, s16, s13 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: 
v_mov_b32_e32 v4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -3083,17 +3086,17 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_movreld_b32 s8, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: s_movreld_b32 s8, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 @@ -3114,8 +3117,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_movreld_b32 s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 @@ -3725,30 +3728,30 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_not_b32_e32 v9, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -3778,9 +3781,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_not_b32_e32 v9, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_not_b32_e32 v9, v2 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -3788,20 +3795,18 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -3973,11 +3978,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, 
v1, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -4014,30 +4019,30 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_not_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 @@ -4062,37 +4067,39 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i ; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s9 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 -; GFX11-NEXT: v_not_b32_e32 v9, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_not_b32_e32 v9, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_mov_b32_e32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 ; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, s12 -; GFX11-NEXT: v_mov_b32_e32 v6, s14 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 
v0, v0, v13, s6 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 4598bcc04a505..f50670376b0c9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -1580,8 +1580,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_cselect_b32 s2, s4, s0 ; GFX7-NEXT: s_cmp_eq_u32 s3, 1 ; GFX7-NEXT: s_cselect_b32 s3, s4, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1607,9 +1607,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0 ; GFX10-NEXT: s_cselect_b32 s0, s3, s0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; @@ -1632,10 +1632,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: s_cselect_b32 s1, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b64 
v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -1991,9 +1992,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v2, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -2013,11 +2014,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 3, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2126,8 +2127,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v3, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 @@ -2151,8 +2152,8 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX11-NEXT: v_not_b32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2631,8 +2632,8 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -2664,14 +2665,14 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -2699,15 +2700,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s0, s4, s0 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_cselect_b32 s1, s4, s1 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: v_dual_mov_b32 v4, 
0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_cselect_b32 s2, s4, s2 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_cselect_b32 s3, s4, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -3178,13 +3180,13 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -3201,27 +3203,29 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 3, v0 ; GFX11-NEXT: s_and_b32 s1, s4, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 3, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_not_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 @@ -3360,13 +3364,13 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10-NEXT: v_not_b32_e32 v5, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -3393,14 +3397,14 @@ define amdgpu_ps void 
@insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX11-NEXT: v_not_b32_e32 v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v4, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 533b25ef1a0c0..c53364de8df88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -274,6 +274,8 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX10-LABEL: dyn_insertelement_v8f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -281,8 +283,6 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 
v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s10 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -298,18 +298,18 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %v ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v8, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s10 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 ; GFX11-NEXT: v_movreld_b32_e32 v0, v8 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -686,23 +686,27 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b32 s18, 0 ; GPRIDX-NEXT: s_mov_b32 s16, 0 +; GPRIDX-NEXT: s_mov_b32 s18, 0 +; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s14, 0 ; GPRIDX-NEXT: s_mov_b32 s12, 0 ; GPRIDX-NEXT: s_mov_b32 s8, 0 +; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 -; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 ; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 ; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 ; 
GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 -; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 @@ -711,12 +715,8 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 @@ -753,58 +753,58 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b64 s[4:5], 1.0 -; GFX10-NEXT: s_mov_b32 s18, 0 -; GFX10-NEXT: s_mov_b32 s16, 0 -; GFX10-NEXT: s_mov_b32 s14, 0 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s8, 0 -; GFX10-NEXT: s_mov_b32 s19, 0x40200000 -; GFX10-NEXT: s_mov_b32 s17, 0x401c0000 -; GFX10-NEXT: s_mov_b32 s15, 0x40180000 -; GFX10-NEXT: s_mov_b32 s13, 0x40140000 -; GFX10-NEXT: s_mov_b64 s[10:11], 4.0 -; GFX10-NEXT: s_mov_b32 s9, 0x40080000 -; GFX10-NEXT: s_mov_b64 s[6:7], 2.0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b64 s[6:7], 2.0 +; GFX10-NEXT: 
s_mov_b32 s9, 0x40080000 ; GFX10-NEXT: v_mov_b32_e32 v5, s6 ; GFX10-NEXT: v_mov_b32_e32 v6, s7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, s8 ; GFX10-NEXT: v_mov_b32_e32 v8, s9 -; GFX10-NEXT: v_mov_b32_e32 v9, s10 -; GFX10-NEXT: v_mov_b32_e32 v10, s11 -; GFX10-NEXT: v_mov_b32_e32 v11, s12 -; GFX10-NEXT: v_mov_b32_e32 v12, s13 -; GFX10-NEXT: v_mov_b32_e32 v13, s14 -; GFX10-NEXT: v_mov_b32_e32 v14, s15 -; GFX10-NEXT: v_mov_b32_e32 v15, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s17 -; GFX10-NEXT: v_mov_b32_e32 v17, s18 -; GFX10-NEXT: v_mov_b32_e32 v18, s19 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 +; GFX10-NEXT: s_mov_b32 s12, 0 +; GFX10-NEXT: s_mov_b64 s[10:11], 4.0 +; GFX10-NEXT: s_mov_b32 s13, 0x40140000 +; GFX10-NEXT: v_mov_b32_e32 v9, s10 +; GFX10-NEXT: v_mov_b32_e32 v10, s11 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, s12 +; GFX10-NEXT: v_mov_b32_e32 v12, s13 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX10-NEXT: s_mov_b32 s14, 0 +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_mov_b32 s16, 0 +; GFX10-NEXT: s_mov_b32 s15, 0x40180000 +; GFX10-NEXT: s_mov_b32 s19, 0x40200000 +; GFX10-NEXT: s_mov_b32 s17, 0x401c0000 +; GFX10-NEXT: v_mov_b32_e32 v13, s14 +; GFX10-NEXT: v_mov_b32_e32 v14, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v2 +; GFX10-NEXT: v_mov_b32_e32 v15, s16 +; GFX10-NEXT: v_mov_b32_e32 v16, s17 +; GFX10-NEXT: v_mov_b32_e32 v17, s18 +; GFX10-NEXT: v_mov_b32_e32 v18, s19 ; 
GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -818,47 +818,47 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GFX11-LABEL: dyn_insertelement_v8f64_const_s_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s14, 0 -; GFX11-NEXT: s_mov_b32 s15, 0x40200000 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_mov_b32 s10, 0 -; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 1.0 -; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 -; GFX11-NEXT: s_mov_b32 s11, 0x40180000 -; GFX11-NEXT: s_mov_b32 s9, 0x40140000 -; GFX11-NEXT: s_mov_b64 s[6:7], 4.0 -; GFX11-NEXT: s_mov_b32 s5, 0x40080000 -; GFX11-NEXT: s_mov_b64 s[2:3], 2.0 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: 
s_mov_b64 s[2:3], 2.0 +; GFX11-NEXT: s_mov_b32 s5, 0x40080000 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], 4.0 +; GFX11-NEXT: s_mov_b32 s9, 0x40140000 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_mov_b32 s14, 0 +; GFX11-NEXT: s_mov_b32 s12, 0 +; GFX11-NEXT: s_mov_b32 s11, 0x40180000 +; GFX11-NEXT: s_mov_b32 s15, 0x40200000 +; GFX11-NEXT: s_mov_b32 s13, 0x401c0000 +; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX11-NEXT: 
global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -884,24 +884,22 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 @@ -926,6 +924,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13] @@ -954,62 +954,62 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_s_v: ; GFX10: ; %bb.0: ; %entry 
-; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, s14 -; GFX10-NEXT: v_mov_b32_e32 v14, s13 -; GFX10-NEXT: v_mov_b32_e32 v13, s12 -; GFX10-NEXT: v_mov_b32_e32 v12, s11 -; GFX10-NEXT: v_mov_b32_e32 v11, s10 -; GFX10-NEXT: v_mov_b32_e32 v10, s9 -; GFX10-NEXT: v_mov_b32_e32 v9, s8 -; GFX10-NEXT: v_mov_b32_e32 v8, s7 -; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v6, s5 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: v_mov_b32_e32 v4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v8, s7 +; GFX10-NEXT: v_mov_b32_e32 v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 +; 
GFX10-NEXT: v_mov_b32_e32 v10, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: v_mov_b32_e32 v12, s11 +; GFX10-NEXT: v_mov_b32_e32 v11, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 +; GFX10-NEXT: v_mov_b32_e32 v16, s15 +; GFX10-NEXT: v_mov_b32_e32 v15, s14 +; GFX10-NEXT: v_mov_b32_e32 v14, s13 +; GFX10-NEXT: v_mov_b32_e32 v13, s12 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[5:8], off @@ -1022,54 +1022,54 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_s_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: 
s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 -; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 -; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 -; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v8, s7 :: v_dual_mov_b32 v7, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; 
GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s19, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s19, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s18, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, s19, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, s18, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, s19, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, s18, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, s19, s1 ; GFX11-NEXT: global_store_b128 v[0:1], v[1:4], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[5:8], off dlc @@ -1095,22 +1095,22 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; 
GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 @@ -1144,25 +1144,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v17, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_lshl_b32 m0, s18, 1 +; GFX10-NEXT: v_mov_b32_e32 v17, s15 ; GFX10-NEXT: v_mov_b32_e32 v16, s14 ; GFX10-NEXT: v_mov_b32_e32 v15, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s12 @@ -1191,25 +1191,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: 
s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_lshl_b32 m0, s18, 1 +; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 ; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 @@ -1303,26 +1303,32 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 
v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 @@ -1331,12 +1337,6 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 @@ -1371,62 +1371,62 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; ; GFX10-LABEL: dyn_insertelement_v8f64_s_v_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v18, s15 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_mov_b32 s1, s3 ; 
GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_mov_b32_e32 v17, s14 -; GFX10-NEXT: v_mov_b32_e32 v16, s13 -; GFX10-NEXT: v_mov_b32_e32 v15, s12 -; GFX10-NEXT: v_mov_b32_e32 v14, s11 -; GFX10-NEXT: v_mov_b32_e32 v13, s10 -; GFX10-NEXT: v_mov_b32_e32 v12, s9 -; GFX10-NEXT: v_mov_b32_e32 v11, s8 -; GFX10-NEXT: v_mov_b32_e32 v10, s7 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v10, s7 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX10-NEXT: v_mov_b32_e32 v12, s9 +; GFX10-NEXT: v_mov_b32_e32 v11, s8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: v_mov_b32_e32 v14, s11 +; GFX10-NEXT: v_mov_b32_e32 v13, s10 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 
v10, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX10-NEXT: v_mov_b32_e32 v18, s15 +; GFX10-NEXT: v_mov_b32_e32 v17, s14 +; GFX10-NEXT: v_mov_b32_e32 v16, s13 +; GFX10-NEXT: v_mov_b32_e32 v15, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -1439,50 +1439,50 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; ; GFX11-LABEL: dyn_insertelement_v8f64_s_v_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: 
v_dual_mov_b32 v15, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v12, v12, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v13, 
v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v1, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v0 :: v_dual_cndmask_b32 v16, v16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v1, s1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[7:10], off dlc @@ -2564,6 +2564,8 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX10-LABEL: dyn_insertelement_v9f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -2572,8 +2574,6 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s11 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2589,6 +2589,7 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX11-LABEL: dyn_insertelement_v9f32_s_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 @@ -2597,7 +2598,6 @@ define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %v ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 m0, s11 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 @@ -2794,6 +2794,8 @@ define amdgpu_ps <10 x float> 
@dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX10-LABEL: dyn_insertelement_v10f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -2803,8 +2805,6 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s12 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -2822,21 +2822,21 @@ define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: v_mov_b32_e32 v10, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s12 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_mov_b32_e32 v8, s8 ; GFX11-NEXT: v_movreld_b32_e32 v0, v10 ; GFX11-NEXT: ; return to shader part epilog entry: @@ 
-3041,6 +3041,8 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX10-LABEL: dyn_insertelement_v11f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -3051,8 +3053,6 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: v_mov_b32_e32 v11, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s13 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -3070,6 +3070,7 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX11-LABEL: dyn_insertelement_v11f32_s_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 @@ -3080,7 +3081,6 @@ define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: s_mov_b32 m0, s13 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 @@ -3304,6 +3304,8 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX10-LABEL: dyn_insertelement_v12f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -3315,8 +3317,6 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> 
inreg ; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: v_mov_b32_e32 v12, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s14 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -3336,8 +3336,10 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v12, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -3346,14 +3348,12 @@ define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: v_mov_b32_e32 v12, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s14 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_mov_b32_e32 v10, s10 ; GFX11-NEXT: v_movreld_b32_e32 v0, v12 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4075,22 +4075,22 @@ entry: define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %vec, i32 %val, i32 inreg %idx) { ; GPRIDX-LABEL: 
dyn_insertelement_v16i32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 @@ -4130,25 +4130,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve ; ; GFX10-LABEL: dyn_insertelement_v16i32_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_mov_b32_e32 v16, s15 -; GFX10-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 m0, s18 +; GFX10-NEXT: v_mov_b32_e32 v16, s15 ; GFX10-NEXT: v_mov_b32_e32 v15, s14 ; GFX10-NEXT: v_mov_b32_e32 v14, s13 ; GFX10-NEXT: v_mov_b32_e32 v13, s12 @@ -4184,25 +4184,25 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %ve ; ; GFX11-LABEL: dyn_insertelement_v16i32_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: s_mov_b32 s11, s13 -; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_mov_b32 s15, s17 ; GFX11-NEXT: s_mov_b32 m0, s18 +; GFX11-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v15, s14 ; GFX11-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v13, s12 ; GFX11-NEXT: v_dual_mov_b32 v12, s11 :: v_dual_mov_b32 v11, s10 ; GFX11-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v9, s8 @@ -4276,6 +4276,8 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX10-LABEL: dyn_insertelement_v16f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v16, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; 
GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -4291,8 +4293,6 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 ; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: v_mov_b32_e32 v16, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s18 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -4316,8 +4316,10 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -4330,16 +4332,14 @@ define amdgpu_ps <16 x float> @dyn_insertelement_v16f32_s_v_s(<16 x float> inreg ; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 -; GFX11-NEXT: v_mov_b32_e32 v16, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s18 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 -; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12 -; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: 
v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_mov_b32_e32 v14, s14 ; GFX11-NEXT: v_movreld_b32_e32 v0, v16 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4423,6 +4423,8 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX10-LABEL: dyn_insertelement_v32f32_s_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v32, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -4454,8 +4456,6 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX10-NEXT: s_mov_b32 s29, s31 ; GFX10-NEXT: s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v32, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_mov_b32 m0, s34 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 @@ -4495,8 +4495,10 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v32, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 @@ -4525,24 +4527,22 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_mov_b32 m0, s34 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s7 
:: v_dual_mov_b32 v6, s6 -; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 -; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 -; GFX11-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s12 -; GFX11-NEXT: v_dual_mov_b32 v15, s15 :: v_dual_mov_b32 v14, s14 -; GFX11-NEXT: v_dual_mov_b32 v17, s17 :: v_dual_mov_b32 v16, s16 -; GFX11-NEXT: v_dual_mov_b32 v19, s19 :: v_dual_mov_b32 v18, s18 -; GFX11-NEXT: v_dual_mov_b32 v21, s21 :: v_dual_mov_b32 v20, s20 -; GFX11-NEXT: v_dual_mov_b32 v23, s23 :: v_dual_mov_b32 v22, s22 -; GFX11-NEXT: v_dual_mov_b32 v25, s25 :: v_dual_mov_b32 v24, s24 -; GFX11-NEXT: v_dual_mov_b32 v27, s27 :: v_dual_mov_b32 v26, s26 -; GFX11-NEXT: v_dual_mov_b32 v29, s29 :: v_dual_mov_b32 v28, s28 -; GFX11-NEXT: v_dual_mov_b32 v31, s31 :: v_dual_mov_b32 v30, s30 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9 +; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s13 +; GFX11-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX11-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s17 +; GFX11-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19 +; GFX11-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21 +; GFX11-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v23, s23 +; GFX11-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v25, s25 +; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v27, s27 +; GFX11-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v29, s29 +; GFX11-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v31, s31 +; GFX11-NEXT: v_mov_b32_e32 v30, s30 ; GFX11-NEXT: v_movreld_b32_e32 v0, v32 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -4553,40 +4553,40 @@ entry: define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %vec, i64 %val, i32 inreg %idx) { ; 
GPRIDX-LABEL: dyn_insertelement_v16i64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4658,41 +4658,41 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> 
inreg %ve ; ; GFX10-LABEL: dyn_insertelement_v16i64_s_v_s: ; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: s_mov_b32 s14, s16 ; GFX10-NEXT: s_mov_b32 s15, s17 +; GFX10-NEXT: s_mov_b32 s16, s18 ; GFX10-NEXT: s_mov_b32 s17, s19 +; GFX10-NEXT: s_mov_b32 s18, s20 ; GFX10-NEXT: s_mov_b32 s19, s21 +; GFX10-NEXT: s_mov_b32 s20, s22 ; GFX10-NEXT: s_mov_b32 s21, s23 +; GFX10-NEXT: s_mov_b32 s22, s24 ; GFX10-NEXT: s_mov_b32 s23, s25 +; GFX10-NEXT: s_mov_b32 s24, s26 ; GFX10-NEXT: s_mov_b32 s25, s27 +; GFX10-NEXT: s_mov_b32 s26, s28 ; GFX10-NEXT: s_mov_b32 s27, s29 +; GFX10-NEXT: s_mov_b32 s28, s30 ; GFX10-NEXT: s_mov_b32 s29, s31 ; GFX10-NEXT: s_mov_b32 s31, s33 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s16, s18 -; GFX10-NEXT: s_mov_b32 s18, s20 -; GFX10-NEXT: s_mov_b32 s20, s22 -; GFX10-NEXT: s_mov_b32 s22, s24 -; GFX10-NEXT: s_mov_b32 s24, s26 -; GFX10-NEXT: s_mov_b32 s26, s28 -; GFX10-NEXT: s_mov_b32 s28, s30 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v33, s31 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s34, 1 +; GFX10-NEXT: v_mov_b32_e32 v33, s31 ; GFX10-NEXT: v_mov_b32_e32 v32, s30 ; GFX10-NEXT: v_mov_b32_e32 v31, s29 ; GFX10-NEXT: v_mov_b32_e32 v30, s28 @@ -4761,41 +4761,41 @@ define amdgpu_ps <16 x i64> 
@dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %ve ; ; GFX11-LABEL: dyn_insertelement_v16i64_s_v_s: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_mov_b32 s16, s18 ; GFX11-NEXT: s_mov_b32 s17, s19 +; GFX11-NEXT: s_mov_b32 s18, s20 ; GFX11-NEXT: s_mov_b32 s19, s21 +; GFX11-NEXT: s_mov_b32 s20, s22 ; GFX11-NEXT: s_mov_b32 s21, s23 +; GFX11-NEXT: s_mov_b32 s22, s24 ; GFX11-NEXT: s_mov_b32 s23, s25 +; GFX11-NEXT: s_mov_b32 s24, s26 ; GFX11-NEXT: s_mov_b32 s25, s27 +; GFX11-NEXT: s_mov_b32 s26, s28 ; GFX11-NEXT: s_mov_b32 s27, s29 +; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 -; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: s_mov_b32 s16, s18 -; GFX11-NEXT: s_mov_b32 s18, s20 -; GFX11-NEXT: s_mov_b32 s20, s22 -; GFX11-NEXT: s_mov_b32 s22, s24 -; GFX11-NEXT: s_mov_b32 s24, s26 -; GFX11-NEXT: s_mov_b32 s26, s28 -; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s34, 1 +; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 ; GFX11-NEXT: 
v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28 ; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26 ; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24 @@ -4853,40 +4853,40 @@ entry: define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 
s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -4958,41 +4958,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; ; GFX10-LABEL: dyn_insertelement_v16f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_mov_b32 s17, s19 -; GFX10-NEXT: s_mov_b32 s19, s21 -; GFX10-NEXT: s_mov_b32 s21, s23 -; GFX10-NEXT: s_mov_b32 s23, s25 -; GFX10-NEXT: s_mov_b32 s25, s27 -; GFX10-NEXT: s_mov_b32 s27, s29 -; GFX10-NEXT: s_mov_b32 s29, s31 -; GFX10-NEXT: s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: s_mov_b32 s14, s16 +; GFX10-NEXT: s_mov_b32 s15, s17 ; GFX10-NEXT: s_mov_b32 s16, s18 +; GFX10-NEXT: s_mov_b32 s17, s19 ; GFX10-NEXT: s_mov_b32 s18, s20 +; GFX10-NEXT: s_mov_b32 s19, s21 ; GFX10-NEXT: s_mov_b32 s20, s22 +; GFX10-NEXT: s_mov_b32 s21, s23 ; GFX10-NEXT: s_mov_b32 s22, s24 +; GFX10-NEXT: s_mov_b32 s23, s25 ; GFX10-NEXT: s_mov_b32 s24, s26 +; GFX10-NEXT: s_mov_b32 s25, s27 ; GFX10-NEXT: s_mov_b32 s26, s28 +; GFX10-NEXT: s_mov_b32 s27, s29 ; GFX10-NEXT: s_mov_b32 s28, s30 +; GFX10-NEXT: s_mov_b32 s29, s31 +; GFX10-NEXT: 
s_mov_b32 s31, s33 ; GFX10-NEXT: s_mov_b32 s30, s32 -; GFX10-NEXT: v_mov_b32_e32 v33, s31 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s34, 1 +; GFX10-NEXT: v_mov_b32_e32 v33, s31 ; GFX10-NEXT: v_mov_b32_e32 v32, s30 ; GFX10-NEXT: v_mov_b32_e32 v31, s29 ; GFX10-NEXT: v_mov_b32_e32 v30, s28 @@ -5061,41 +5061,41 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inr ; ; GFX11-LABEL: dyn_insertelement_v16f64_s_v_s: ; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 ; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_mov_b32 s16, s18 ; GFX11-NEXT: s_mov_b32 s17, s19 +; GFX11-NEXT: s_mov_b32 s18, s20 ; GFX11-NEXT: s_mov_b32 s19, s21 +; GFX11-NEXT: s_mov_b32 s20, s22 ; GFX11-NEXT: s_mov_b32 s21, s23 +; GFX11-NEXT: s_mov_b32 s22, s24 ; GFX11-NEXT: s_mov_b32 s23, s25 +; GFX11-NEXT: s_mov_b32 s24, s26 ; GFX11-NEXT: s_mov_b32 s25, s27 +; GFX11-NEXT: s_mov_b32 s26, s28 ; GFX11-NEXT: s_mov_b32 s27, s29 +; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s29, s31 ; GFX11-NEXT: s_mov_b32 s31, s33 -; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: s_mov_b32 s10, s12 -; GFX11-NEXT: s_mov_b32 s12, s14 -; GFX11-NEXT: s_mov_b32 s14, s16 -; GFX11-NEXT: s_mov_b32 s16, s18 -; GFX11-NEXT: s_mov_b32 s18, s20 -; GFX11-NEXT: s_mov_b32 s20, s22 -; GFX11-NEXT: s_mov_b32 s22, s24 -; GFX11-NEXT: s_mov_b32 
s24, s26 -; GFX11-NEXT: s_mov_b32 s26, s28 -; GFX11-NEXT: s_mov_b32 s28, s30 ; GFX11-NEXT: s_mov_b32 s30, s32 -; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s34, 1 +; GFX11-NEXT: v_dual_mov_b32 v33, s31 :: v_dual_mov_b32 v32, s30 ; GFX11-NEXT: v_dual_mov_b32 v31, s29 :: v_dual_mov_b32 v30, s28 ; GFX11-NEXT: v_dual_mov_b32 v29, s27 :: v_dual_mov_b32 v28, s26 ; GFX11-NEXT: v_dual_mov_b32 v27, s25 :: v_dual_mov_b32 v26, s24 @@ -5498,8 +5498,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 @@ -5539,6 +5537,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -5551,10 +5550,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: v_mov_b32_e32 v17, s15 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_lshl_b32 m0, s16, 1 -; GFX10-NEXT: v_mov_b32_e32 v16, s14 ; GFX10-NEXT: v_mov_b32_e32 v15, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s12 ; GFX10-NEXT: v_mov_b32_e32 v13, s11 @@ -5590,6 +5586,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_mov_b32 s2, s4 ; 
GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -5602,8 +5599,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_s(<7 x double> inreg ; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: s_lshl_b32 m0, s16, 1 ; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 ; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 @@ -5650,8 +5645,6 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 @@ -5709,57 +5702,55 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v9, s6 +; GFX10-NEXT: v_mov_b32_e32 v10, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v7, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, 
s11 ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: v_mov_b32_e32 v18, s15 -; GFX10-NEXT: v_mov_b32_e32 v17, s14 -; GFX10-NEXT: v_mov_b32_e32 v16, s13 ; GFX10-NEXT: v_mov_b32_e32 v15, s12 +; GFX10-NEXT: v_mov_b32_e32 v16, s13 ; GFX10-NEXT: v_mov_b32_e32 v14, s11 ; GFX10-NEXT: v_mov_b32_e32 v13, s10 ; GFX10-NEXT: v_mov_b32_e32 v12, s9 ; GFX10-NEXT: v_mov_b32_e32 v11, s8 -; GFX10-NEXT: v_mov_b32_e32 v10, s7 -; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v7, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s3, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v3 ; GFX10-NEXT: 
v_readfirstlane_b32 s1, v4 +; GFX10-NEXT: v_readfirstlane_b32 s3, v6 ; GFX10-NEXT: v_readfirstlane_b32 s4, v7 ; GFX10-NEXT: v_readfirstlane_b32 s5, v8 ; GFX10-NEXT: v_readfirstlane_b32 s6, v9 @@ -5778,46 +5769,45 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_dual_mov_b32 v9, s6 :: v_dual_mov_b32 v10, s7 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s10, s12 ; GFX11-NEXT: s_mov_b32 s11, s13 ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 +; GFX11-NEXT: v_dual_mov_b32 v15, s12 :: v_dual_mov_b32 v16, s13 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 -; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 -; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: 
v_cmp_eq_u32_e64 s1, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v2, v12, v1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v6 ; GFX11-NEXT: v_readfirstlane_b32 s4, v7 ; GFX11-NEXT: v_readfirstlane_b32 s5, v8 ; GFX11-NEXT: v_readfirstlane_b32 s6, v9 @@ -6040,35 +6030,35 @@ entry: define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 ; GPRIDX-NEXT: 
v_cmp_eq_u32_e64 vcc, s12, 0 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc @@ -6089,35 +6079,35 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v11, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: v_mov_b32_e32 
v10, s8 +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; GFX10-NEXT: v_mov_b32_e32 v9, s7 ; GFX10-NEXT: v_mov_b32_e32 v8, s6 ; GFX10-NEXT: v_mov_b32_e32 v7, s5 ; GFX10-NEXT: v_mov_b32_e32 v6, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX10-NEXT: v_readfirstlane_b32 s2, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo @@ -6138,29 +6128,29 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 ; 
GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_cndmask_b32 v3, v3, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v0 :: v_dual_cndmask_b32 v7, v7, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0 @@ -6185,30 +6175,30 @@ entry: define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; 
GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 @@ -6234,35 +6224,35 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_mov_b32_e32 v12, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 ; GFX10-NEXT: v_mov_b32_e32 v11, s8 +; GFX10-NEXT: v_mov_b32_e32 v12, s9 ; GFX10-NEXT: v_mov_b32_e32 v10, s7 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v7, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s3 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX10-NEXT: 
v_cmp_eq_u32_e64 s0, 3, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo @@ -6283,29 +6273,29 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s9, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: s_mov_b32 s8, s10 -; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 -; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; GFX11-NEXT: v_readfirstlane_b32 s2, v5 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v2, v8, 
v1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 588802cbd56c7..272dfaf59848a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -34,8 +34,8 @@ define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr add ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s0, s1, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 7fd981c3f3fc6..fa0030c566743 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 6846137272ec6..bb44bd0be28af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -194,11 +194,11 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s6, 1, 0 ; GFX7-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_nop 3 @@ -212,11 +212,11 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 @@ -230,9 +230,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -245,9 +245,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX10_W64-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W64-NEXT: v_div_fmas_f64 
v[0:1], s[0:1], v[0:1], v[2:3] @@ -273,9 +273,9 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double ; GFX11_W64-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11_W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11_W64-NEXT: s_and_b32 s6, 1, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX11_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -729,11 +729,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX7-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-NEXT: s_and_b32 s0, 1, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_mov_b32 s10, -1 @@ -749,11 +749,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: s_and_b32 s0, 1, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 @@ -771,9 +771,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s12 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W32-NEXT: 
v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s13 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s15 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 @@ -788,9 +788,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s12 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s13 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s14 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s15 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] ; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -820,9 +820,9 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: s_and_b32 s8, 1, s8 ; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6 ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6 ; GFX11_W64-NEXT: v_mov_b32_e32 v3, s7 ; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] ; GFX11_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -1208,9 +1208,9 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1368,8 +1368,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 0535394d1025c..c3e9df721f368 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -379,10 +379,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -453,10 +453,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -527,10 +527,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: 
flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -601,10 +601,10 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1213,10 +1213,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1282,10 +1282,10 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll index 2d0d04e1b533e..898c5794414fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -112,19 +112,20 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: 
s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -137,17 +138,19 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: 
v_dual_mov_b32 v3, v10 :: v_dual_mov_b32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[10:11] @@ -221,19 +224,20 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -246,17 +250,19 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, 
v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v3, v10 :: v_dual_mov_b32 v4, v11 ; GFX12-NEXT: image_load v[0:4], [v5, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index 676bd8856ce6e..bf3262deca0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -66,15 +66,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -83,6 +81,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 @@ 
-190,15 +190,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -207,6 +205,8 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll index a101a15ea8140..138efac05f041 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -112,24 +112,24 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 -; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: 
s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -138,22 +138,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX12-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v9, 0 -; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 
+; GFX12-NEXT: v_dual_mov_b32 v3, v12 :: v_dual_mov_b32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] @@ -227,24 +228,24 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 -; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -253,22 +254,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX12-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v9, 0 -; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: v_dual_mov_b32 
v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; GFX12-NEXT: s_mov_b32 s0, s2 ; GFX12-NEXT: s_mov_b32 s1, s3 ; GFX12-NEXT: s_mov_b32 s2, s4 ; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: s_mov_b32 s4, s6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 +; GFX12-NEXT: v_dual_mov_b32 v3, v12 :: v_dual_mov_b32 v4, v13 ; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index b20dc4b539276..f7dc1e4e9d323 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -59,15 +59,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -76,6 +73,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-NEXT: v_mov_b32_e32 v3, v10 @@ -117,23 +117,23 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_mov_b32_e32 v12, v2 -; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v12, v2 +; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -155,8 
+155,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -172,15 +172,12 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mov_b32_e32 v10, v7 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -189,6 +186,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-NEXT: v_mov_b32_e32 v3, v10 @@ -230,23 +230,23 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_mov_b32_e32 v12, v2 -; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v12, v2 +; GFX11-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-NEXT: v_mov_b32_e32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v[11:12], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -268,8 +268,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: v_dual_mov_b32 v1, v7 :: v_dual_mov_b32 v2, v8 ; GFX12-NEXT: v_mov_b32_e32 v4, v10 ; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll index 7f32d8e6e16b4..add2149a70226 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -114,8 +114,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -124,9 +124,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] @@ -225,8 +225,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -235,9 +235,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: 
v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll index c1c383eb583aa..48a854a71a088 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -698,7 +698,6 @@ define amdgpu_ps void @image_store_v4f32_dmask_0110(<8 x i32> inreg %rsrc, i32 % define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, float %in) #0 { ; GFX6-LABEL: image_store_f32_dmask_1111: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v1, s10 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -707,13 +706,13 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: image_store_f32_dmask_1111: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_mov_b32 s0, s2 ; GFX8-NEXT: s_mov_b32 s1, s3 ; GFX8-NEXT: s_mov_b32 s2, s4 @@ -722,6 +721,7 @@ define amdgpu_ps void @image_store_f32_dmask_1111(<8 x i32> inreg %rsrc, i32 inr ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf unorm ; GFX8-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index c23afeb63a06a..e0fe31e0d1090 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -669,22 +669,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -692,9 +694,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr 
%p_node_ptr, ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -781,15 +783,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v5, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -797,9 +801,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, 
vcc_lo ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s9 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -891,28 +895,28 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s9, 4.0 +; GFX11-NEXT: s_mov_b32 s8, 0x40400000 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 -; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v5, s10 +; GFX11-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -995,24 +999,24 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_mov_b32 s5, 1.0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s8, 0x42004600 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b32 s9, 0x44004700 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: s_mov_b32 s6, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index cc21305a5a193..e28155d63a0b4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -24,7 +24,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 @@ -40,6 +39,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: v_accvgpr_write_b32 a13, s13 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15 +; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a17, s17 ; GCN-NEXT: v_accvgpr_write_b32 a18, s18 ; GCN-NEXT: v_accvgpr_write_b32 a19, s19 @@ -319,12 +319,12 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_accvgpr_write_b32 a2, s8 -; GCN-NEXT: v_accvgpr_write_b32 a4, s8 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s8 ; GCN-NEXT: v_accvgpr_write_b32 a3, s9 +; GCN-NEXT: v_accvgpr_write_b32 a4, s8 ; GCN-NEXT: v_accvgpr_write_b32 a5, s9 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_nop 1 @@ -351,12 
+351,12 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GCN-NEXT: v_accvgpr_write_b32 a0, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GCN-NEXT: v_accvgpr_write_b32 a2, s6 -; GCN-NEXT: v_accvgpr_write_b32 a4, s6 -; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a1, s7 +; GCN-NEXT: v_accvgpr_write_b32 a2, s6 ; GCN-NEXT: v_accvgpr_write_b32 a3, s7 +; GCN-NEXT: v_accvgpr_write_b32 a4, s6 ; GCN-NEXT: v_accvgpr_write_b32 a5, s7 +; GCN-NEXT: v_accvgpr_write_b32 a6, s6 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1] ; GCN-NEXT: s_nop 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 90e2840f0d667..570a33fa6f753 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) { ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -52,8 +52,9 @@ define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v2, 
s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index e5d9884e5ee29..28d1bd0eb3b63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -10,10 +10,10 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll index b1de0eff05d30..3bcc1a6a3affd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -291,10 +291,10 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_readfirstlane_b32 s9, v5 ; GFX9-NEXT: v_readfirstlane_b32 s10, v6 ; GFX9-NEXT: v_readfirstlane_b32 s11, v7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -322,10 +322,10 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX7-NEXT: v_readfirstlane_b32 s3, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s8, v4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_readfirstlane_b32 s9, v5 ; GFX7-NEXT: v_readfirstlane_b32 s10, v6 ; GFX7-NEXT: 
v_readfirstlane_b32 s11, v7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -383,9 +383,9 @@ define amdgpu_ps void @test_s_load_constant_v8i32_align1(ptr addrspace(4) inreg ; GFX11-NEXT: v_readfirstlane_b32 s11, v7 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11 -; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10 +; GFX11-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v4, s8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v6, s10 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 27005e7aa175e..7704463ca8006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -406,12 +406,12 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v3 -; GFX7-NEXT: v_readfirstlane_b32 s5, v5 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s4, v4 -; GFX7-NEXT: s_add_i32 s1, s1, s5 +; GFX7-NEXT: v_readfirstlane_b32 s5, v5 ; GFX7-NEXT: s_add_i32 s0, s0, s4 +; GFX7-NEXT: s_add_i32 s1, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -425,14 +425,14 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a 
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] ; GFX11-NEXT: global_load_b64 v[4:5], v4, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s1, v3 -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: s_add_i32 s1, s1, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_add_i32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -443,14 +443,14 @@ define amdgpu_ps void @load_uniform_P1_v2i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b64 v[2:3], v4, s[0:1] ; GFX12-NEXT: global_load_b64 v[4:5], v4, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s1, v3 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: s_add_co_i32 s1, s1, s3 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: s_add_co_i32 s0, s0, s2 +; GFX12-NEXT: s_add_co_i32 s1, s1, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %ptra, align 2 @@ -471,9 +471,9 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 -; GFX7-NEXT: v_readfirstlane_b32 s4, v5 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 +; GFX7-NEXT: 
v_readfirstlane_b32 s4, v5 ; GFX7-NEXT: v_readfirstlane_b32 s5, v6 ; GFX7-NEXT: v_readfirstlane_b32 s7, v7 ; GFX7-NEXT: s_add_i32 s4, s0, s4 @@ -493,18 +493,17 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX11-NEXT: global_load_b96 v[2:4], v5, s[0:1] ; GFX11-NEXT: global_load_b96 v[5:7], v5, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s2, v4 ; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 -; GFX11-NEXT: s_add_i32 s2, s2, s5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: s_add_i32 s0, s0, s3 ; GFX11-NEXT: s_add_i32 s1, s1, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_add_i32 s2, s2, s5 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: s_endpgm ; @@ -515,18 +514,17 @@ define amdgpu_ps void @load_uniform_P1_v3i32_gfx12(ptr addrspace(1) inreg %ptra, ; GFX12-NEXT: global_load_b96 v[2:4], v5, s[0:1] ; GFX12-NEXT: global_load_b96 v[5:7], v5, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: v_readfirstlane_b32 s2, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 -; GFX12-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: s_add_co_i32 s0, s0, s3 ; GFX12-NEXT: s_add_co_i32 s1, s1, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; 
GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s2, s2, s5 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s2 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: s_endpgm %a = load <3 x i32>, ptr addrspace(1) %ptra, align 2 @@ -547,10 +545,10 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 -; GFX7-NEXT: v_readfirstlane_b32 s4, v6 ; GFX7-NEXT: v_readfirstlane_b32 s1, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 +; GFX7-NEXT: v_readfirstlane_b32 s4, v6 ; GFX7-NEXT: v_readfirstlane_b32 s5, v7 ; GFX7-NEXT: v_readfirstlane_b32 s8, v8 ; GFX7-NEXT: v_readfirstlane_b32 s9, v9 @@ -573,21 +571,20 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: s_add_i32 s0, s0, s4 ; GFX11-NEXT: s_add_i32 s1, s1, s5 ; GFX11-NEXT: s_add_i32 s2, s2, s6 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_add_i32 s3, s3, s7 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b128 v[0:1], 
v[2:5], off ; GFX11-NEXT: s_endpgm ; @@ -598,21 +595,20 @@ define amdgpu_ps void @load_uniform_P1_v4i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[2:5], v6, s[0:1] ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 -; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: s_add_co_i32 s0, s0, s4 ; GFX12-NEXT: s_add_co_i32 s1, s1, s5 ; GFX12-NEXT: s_add_co_i32 s2, s2, s6 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s3, s3, s7 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(1) %ptra, align 2 @@ -638,29 +634,29 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 -; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 -; GFX7-NEXT: v_readfirstlane_b32 s8, v6 +; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 -; GFX7-NEXT: v_readfirstlane_b32 s16, v14 -; GFX7-NEXT: s_add_i32 s4, s4, s12 +; GFX7-NEXT: v_readfirstlane_b32 s8, v6 ; 
GFX7-NEXT: v_readfirstlane_b32 s9, v7 ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 +; GFX7-NEXT: v_readfirstlane_b32 s16, v14 ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 +; GFX7-NEXT: s_add_i32 s4, s4, s12 ; GFX7-NEXT: s_add_i32 s5, s5, s13 ; GFX7-NEXT: s_add_i32 s6, s6, s14 ; GFX7-NEXT: s_add_i32 s7, s7, s15 ; GFX7-NEXT: s_add_i32 s8, s8, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s9, s17 ; GFX7-NEXT: s_add_i32 s10, s10, s18 ; GFX7-NEXT: s_add_i32 s11, s11, s19 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 @@ -682,30 +678,30 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 -; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 -; GFX11-NEXT: s_add_i32 s3, s3, s11 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 ; GFX11-NEXT: s_add_i32 s0, s0, s8 ; GFX11-NEXT: s_add_i32 s1, s1, s9 ; GFX11-NEXT: s_add_i32 s2, s2, s10 -; GFX11-NEXT: s_add_i32 
s7, s7, s15 +; GFX11-NEXT: s_add_i32 s3, s3, s11 ; GFX11-NEXT: s_add_i32 s4, s4, s12 ; GFX11-NEXT: s_add_i32 s5, s5, s13 ; GFX11-NEXT: s_add_i32 s6, s6, s14 +; GFX11-NEXT: s_add_i32 s7, s7, s15 ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -725,30 +721,30 @@ define amdgpu_ps void @load_uniform_P1_v8i32(ptr addrspace(1) inreg %ptra, ptr a ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 -; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 -; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 -; GFX12-NEXT: s_add_co_i32 s3, s3, s11 +; GFX12-NEXT: v_readfirstlane_b32 s15, v17 ; GFX12-NEXT: s_add_co_i32 s0, s0, s8 ; GFX12-NEXT: s_add_co_i32 s1, s1, s9 ; GFX12-NEXT: s_add_co_i32 s2, s2, s10 -; GFX12-NEXT: s_add_co_i32 s7, s7, s15 +; GFX12-NEXT: s_add_co_i32 s3, s3, s11 ; GFX12-NEXT: s_add_co_i32 s4, s4, s12 ; GFX12-NEXT: s_add_co_i32 s5, s5, s13 ; GFX12-NEXT: s_add_co_i32 s6, s6, s14 +; GFX12-NEXT: s_add_co_i32 s7, s7, s15 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: 
v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -822,30 +818,30 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 ; GFX7-NEXT: s_add_i32 s7, s7, s23 ; GFX7-NEXT: s_add_i32 s8, s8, s24 -; GFX7-NEXT: s_add_i32 s12, s12, s28 -; GFX7-NEXT: s_add_i32 s16, s16, s33 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s9, s25 ; GFX7-NEXT: s_add_i32 s10, s10, s26 ; GFX7-NEXT: s_add_i32 s11, s11, s27 +; GFX7-NEXT: s_add_i32 s12, s12, s28 ; GFX7-NEXT: s_add_i32 s13, s13, s29 ; GFX7-NEXT: s_add_i32 s14, s14, s30 ; GFX7-NEXT: s_add_i32 s15, s15, s31 +; GFX7-NEXT: s_add_i32 s16, s16, s33 ; GFX7-NEXT: s_add_i32 s17, s17, s34 ; GFX7-NEXT: s_add_i32 s18, s18, s35 ; GFX7-NEXT: s_add_i32 s19, s19, s36 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-NEXT: v_mov_b32_e32 v10, s12 -; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 @@ -871,57 +867,57 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11-NEXT: v_readfirstlane_b32 s19, v21 -; GFX11-NEXT: v_readfirstlane_b32 s7, v9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-NEXT: v_readfirstlane_b32 s16, v18 ; 
GFX11-NEXT: v_readfirstlane_b32 s17, v19 ; GFX11-NEXT: v_readfirstlane_b32 s18, v20 -; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s19, v21 ; GFX11-NEXT: v_readfirstlane_b32 s4, v6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v7 ; GFX11-NEXT: v_readfirstlane_b32 s6, v8 -; GFX11-NEXT: v_readfirstlane_b32 s11, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v9 ; GFX11-NEXT: v_readfirstlane_b32 s20, v22 ; GFX11-NEXT: v_readfirstlane_b32 s21, v23 ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 -; GFX11-NEXT: v_readfirstlane_b32 s27, v29 +; GFX11-NEXT: v_readfirstlane_b32 s23, v25 ; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v11 ; GFX11-NEXT: v_readfirstlane_b32 s10, v12 -; GFX11-NEXT: v_readfirstlane_b32 s15, v17 +; GFX11-NEXT: v_readfirstlane_b32 s11, v13 ; GFX11-NEXT: v_readfirstlane_b32 s24, v26 ; GFX11-NEXT: v_readfirstlane_b32 s25, v27 ; GFX11-NEXT: v_readfirstlane_b32 s26, v28 -; GFX11-NEXT: v_readfirstlane_b32 s31, v33 +; GFX11-NEXT: v_readfirstlane_b32 s27, v29 ; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v15 ; GFX11-NEXT: v_readfirstlane_b32 s14, v16 +; GFX11-NEXT: v_readfirstlane_b32 s15, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v30 ; GFX11-NEXT: v_readfirstlane_b32 s29, v31 ; GFX11-NEXT: v_readfirstlane_b32 s30, v32 -; GFX11-NEXT: s_add_i32 s3, s3, s19 +; GFX11-NEXT: v_readfirstlane_b32 s31, v33 ; GFX11-NEXT: s_add_i32 s0, s0, s16 ; GFX11-NEXT: s_add_i32 s1, s1, s17 ; GFX11-NEXT: s_add_i32 s2, s2, s18 -; GFX11-NEXT: s_add_i32 s7, s7, s23 +; GFX11-NEXT: s_add_i32 s3, s3, s19 ; GFX11-NEXT: s_add_i32 s4, s4, s20 ; GFX11-NEXT: s_add_i32 s5, s5, s21 ; GFX11-NEXT: s_add_i32 s6, s6, s22 -; GFX11-NEXT: s_add_i32 s11, s11, s27 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s7, s7, s23 ; GFX11-NEXT: s_add_i32 s8, s8, s24 ; GFX11-NEXT: s_add_i32 s9, s9, s25 ; GFX11-NEXT: s_add_i32 s10, s10, s26 -; GFX11-NEXT: s_add_i32 s15, s15, s31 -; GFX11-NEXT: v_dual_mov_b32 v4, 
s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s11, s11, s27 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s12, s28 ; GFX11-NEXT: s_add_i32 s13, s13, s29 ; GFX11-NEXT: s_add_i32 s14, s14, s30 +; GFX11-NEXT: s_add_i32 s15, s15, s31 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -951,57 +947,57 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b128 v[30:33], v30, s[0:1] offset:48 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3 ; GFX12-NEXT: v_readfirstlane_b32 s2, v4 -; GFX12-NEXT: v_readfirstlane_b32 s19, v21 -; GFX12-NEXT: v_readfirstlane_b32 s7, v9 +; GFX12-NEXT: v_readfirstlane_b32 s3, v5 ; GFX12-NEXT: v_readfirstlane_b32 s16, v18 ; GFX12-NEXT: v_readfirstlane_b32 s17, v19 ; GFX12-NEXT: v_readfirstlane_b32 s18, v20 -; GFX12-NEXT: v_readfirstlane_b32 s23, v25 +; GFX12-NEXT: v_readfirstlane_b32 s19, v21 ; GFX12-NEXT: v_readfirstlane_b32 s4, v6 ; GFX12-NEXT: v_readfirstlane_b32 s5, v7 ; GFX12-NEXT: v_readfirstlane_b32 s6, v8 -; GFX12-NEXT: v_readfirstlane_b32 s11, v13 +; GFX12-NEXT: v_readfirstlane_b32 s7, v9 ; GFX12-NEXT: v_readfirstlane_b32 s20, v22 ; GFX12-NEXT: v_readfirstlane_b32 s21, v23 ; GFX12-NEXT: v_readfirstlane_b32 s22, v24 -; GFX12-NEXT: v_readfirstlane_b32 s27, v29 +; GFX12-NEXT: v_readfirstlane_b32 s23, v25 ; GFX12-NEXT: v_readfirstlane_b32 s8, v10 ; GFX12-NEXT: v_readfirstlane_b32 s9, v11 ; GFX12-NEXT: v_readfirstlane_b32 s10, v12 -; GFX12-NEXT: v_readfirstlane_b32 s15, v17 +; GFX12-NEXT: 
v_readfirstlane_b32 s11, v13 ; GFX12-NEXT: v_readfirstlane_b32 s24, v26 ; GFX12-NEXT: v_readfirstlane_b32 s25, v27 ; GFX12-NEXT: v_readfirstlane_b32 s26, v28 -; GFX12-NEXT: v_readfirstlane_b32 s31, v33 +; GFX12-NEXT: v_readfirstlane_b32 s27, v29 ; GFX12-NEXT: v_readfirstlane_b32 s12, v14 ; GFX12-NEXT: v_readfirstlane_b32 s13, v15 ; GFX12-NEXT: v_readfirstlane_b32 s14, v16 +; GFX12-NEXT: v_readfirstlane_b32 s15, v17 ; GFX12-NEXT: v_readfirstlane_b32 s28, v30 ; GFX12-NEXT: v_readfirstlane_b32 s29, v31 ; GFX12-NEXT: v_readfirstlane_b32 s30, v32 -; GFX12-NEXT: s_add_co_i32 s3, s3, s19 +; GFX12-NEXT: v_readfirstlane_b32 s31, v33 ; GFX12-NEXT: s_add_co_i32 s0, s0, s16 ; GFX12-NEXT: s_add_co_i32 s1, s1, s17 ; GFX12-NEXT: s_add_co_i32 s2, s2, s18 -; GFX12-NEXT: s_add_co_i32 s7, s7, s23 +; GFX12-NEXT: s_add_co_i32 s3, s3, s19 ; GFX12-NEXT: s_add_co_i32 s4, s4, s20 ; GFX12-NEXT: s_add_co_i32 s5, s5, s21 ; GFX12-NEXT: s_add_co_i32 s6, s6, s22 -; GFX12-NEXT: s_add_co_i32 s11, s11, s27 -; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s7, s7, s23 ; GFX12-NEXT: s_add_co_i32 s8, s8, s24 ; GFX12-NEXT: s_add_co_i32 s9, s9, s25 ; GFX12-NEXT: s_add_co_i32 s10, s10, s26 -; GFX12-NEXT: s_add_co_i32 s15, s15, s31 -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s11, s11, s27 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s12, s28 ; GFX12-NEXT: s_add_co_i32 s13, s13, s29 ; GFX12-NEXT: s_add_co_i32 s14, s14, s30 +; GFX12-NEXT: s_add_co_i32 s15, s15, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -1615,11 +1611,11 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg 
%ptra, ptr a ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s1, s5, s1 ; GFX7-NEXT: s_add_i32 s0, s4, s0 +; GFX7-NEXT: s_add_i32 s1, s5, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 @@ -1638,7 +1634,7 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: s_add_i32 s0, s2, s0 ; GFX11-NEXT: s_add_i32 s1, s3, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; @@ -1654,7 +1650,7 @@ define amdgpu_ps void @load_uniform_P4_v2i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: s_add_co_i32 s0, s2, s0 ; GFX12-NEXT: s_add_co_i32 s1, s3, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(4) %ptra, align 2 @@ -1693,16 +1689,15 @@ define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s5, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s3, v2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v3 -; GFX11-NEXT: s_add_i32 s2, s5, s2 +; GFX11-NEXT: v_readfirstlane_b32 s5, v4 ; GFX11-NEXT: s_add_i32 s0, s3, s0 ; GFX11-NEXT: s_add_i32 s1, s4, 
s1 +; GFX11-NEXT: s_add_i32 s2, s5, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: s_endpgm @@ -1713,15 +1708,15 @@ define amdgpu_ps void @load_uniform_P4_v3i32_gfx12(ptr addrspace(4) inreg %ptra, ; GFX12-NEXT: global_load_b96 v[2:4], v2, s[0:1] ; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s5, v4 ; GFX12-NEXT: v_readfirstlane_b32 s3, v2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v3 +; GFX12-NEXT: v_readfirstlane_b32 s5, v4 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s2, s5, s2 ; GFX12-NEXT: s_add_co_i32 s0, s3, s0 ; GFX12-NEXT: s_add_co_i32 s1, s4, s1 +; GFX12-NEXT: s_add_co_i32 s2, s5, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX12-NEXT: s_endpgm @@ -1765,18 +1760,17 @@ define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s7, v5 ; GFX11-NEXT: v_readfirstlane_b32 s4, v2 ; GFX11-NEXT: v_readfirstlane_b32 s5, v3 ; GFX11-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-NEXT: v_readfirstlane_b32 s7, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s7, s3 ; GFX11-NEXT: s_add_i32 s0, s4, s0 ; GFX11-NEXT: s_add_i32 s1, s5, s1 ; GFX11-NEXT: s_add_i32 s2, s6, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_add_i32 
s3, s7, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: s_endpgm ; @@ -1786,18 +1780,17 @@ define amdgpu_ps void @load_uniform_P4_v4i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[2:5], v2, s[0:1] ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s7, v5 ; GFX12-NEXT: v_readfirstlane_b32 s4, v2 ; GFX12-NEXT: v_readfirstlane_b32 s5, v3 ; GFX12-NEXT: v_readfirstlane_b32 s6, v4 +; GFX12-NEXT: v_readfirstlane_b32 s7, v5 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s7, s3 ; GFX12-NEXT: s_add_co_i32 s0, s4, s0 ; GFX12-NEXT: s_add_co_i32 s1, s5, s1 ; GFX12-NEXT: s_add_co_i32 s2, s6, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_add_co_i32 s3, s7, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_endpgm %a = load <4 x i32>, ptr addrspace(4) %ptra, align 2 @@ -1825,19 +1818,19 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX7-NEXT: v_readfirstlane_b32 s15, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s16, v6 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s4, s12, s4 ; GFX7-NEXT: v_readfirstlane_b32 s17, v7 ; GFX7-NEXT: v_readfirstlane_b32 s18, v8 ; GFX7-NEXT: v_readfirstlane_b32 s19, v9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s12, s4 ; GFX7-NEXT: s_add_i32 s5, s13, s5 ; GFX7-NEXT: s_add_i32 s6, s14, s6 ; GFX7-NEXT: s_add_i32 s7, s15, s7 ; GFX7-NEXT: s_add_i32 s8, s16, s8 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s17, s9 ; 
GFX7-NEXT: s_add_i32 s10, s18, s10 ; GFX7-NEXT: s_add_i32 s11, s19, s11 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 @@ -1857,24 +1850,24 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX11-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_readfirstlane_b32 s11, v5 ; GFX11-NEXT: v_readfirstlane_b32 s8, v2 ; GFX11-NEXT: v_readfirstlane_b32 s9, v3 ; GFX11-NEXT: v_readfirstlane_b32 s10, v4 +; GFX11-NEXT: v_readfirstlane_b32 s11, v5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s15, v9 ; GFX11-NEXT: v_readfirstlane_b32 s12, v6 ; GFX11-NEXT: v_readfirstlane_b32 s13, v7 ; GFX11-NEXT: v_readfirstlane_b32 s14, v8 +; GFX11-NEXT: v_readfirstlane_b32 s15, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s11, s3 ; GFX11-NEXT: s_add_i32 s0, s8, s0 ; GFX11-NEXT: s_add_i32 s1, s9, s1 ; GFX11-NEXT: s_add_i32 s2, s10, s2 -; GFX11-NEXT: s_add_i32 s7, s15, s7 +; GFX11-NEXT: s_add_i32 s3, s11, s3 ; GFX11-NEXT: s_add_i32 s4, s12, s4 ; GFX11-NEXT: s_add_i32 s5, s13, s5 ; GFX11-NEXT: s_add_i32 s6, s14, s6 +; GFX11-NEXT: s_add_i32 s7, s15, s7 ; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -1892,24 +1885,24 @@ define amdgpu_ps void @load_uniform_P4_v8i32(ptr addrspace(4) inreg %ptra, ptr a ; GFX12-NEXT: global_load_b128 v[6:9], v6, s[0:1] offset:16 ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s11, v5 ; GFX12-NEXT: v_readfirstlane_b32 s8, v2 ; GFX12-NEXT: v_readfirstlane_b32 s9, v3 ; GFX12-NEXT: v_readfirstlane_b32 s10, v4 +; GFX12-NEXT: v_readfirstlane_b32 s11, v5 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
v_readfirstlane_b32 s15, v9 ; GFX12-NEXT: v_readfirstlane_b32 s12, v6 ; GFX12-NEXT: v_readfirstlane_b32 s13, v7 ; GFX12-NEXT: v_readfirstlane_b32 s14, v8 +; GFX12-NEXT: v_readfirstlane_b32 s15, v9 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s11, s3 ; GFX12-NEXT: s_add_co_i32 s0, s8, s0 ; GFX12-NEXT: s_add_co_i32 s1, s9, s1 ; GFX12-NEXT: s_add_co_i32 s2, s10, s2 -; GFX12-NEXT: s_add_co_i32 s7, s15, s7 +; GFX12-NEXT: s_add_co_i32 s3, s11, s3 ; GFX12-NEXT: s_add_co_i32 s4, s12, s4 ; GFX12-NEXT: s_add_co_i32 s5, s13, s5 ; GFX12-NEXT: s_add_co_i32 s6, s14, s6 +; GFX12-NEXT: s_add_co_i32 s7, s15, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 @@ -1945,49 +1938,49 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX7-NEXT: v_readfirstlane_b32 s23, v5 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_readfirstlane_b32 s24, v6 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_readfirstlane_b32 s28, v10 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readfirstlane_b32 s33, v14 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s4, s20, s4 ; GFX7-NEXT: v_readfirstlane_b32 s25, v7 ; GFX7-NEXT: v_readfirstlane_b32 s26, v8 ; GFX7-NEXT: v_readfirstlane_b32 s27, v9 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_readfirstlane_b32 s28, v10 ; GFX7-NEXT: v_readfirstlane_b32 s29, v11 ; GFX7-NEXT: v_readfirstlane_b32 s30, v12 ; GFX7-NEXT: v_readfirstlane_b32 s31, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s33, v14 ; GFX7-NEXT: v_readfirstlane_b32 s34, v15 ; GFX7-NEXT: v_readfirstlane_b32 s35, v16 ; GFX7-NEXT: v_readfirstlane_b32 s36, v17 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s20, s4 ; GFX7-NEXT: s_add_i32 s5, s21, s5 ; GFX7-NEXT: s_add_i32 s6, s22, s6 ; GFX7-NEXT: s_add_i32 s7, s23, s7 ; GFX7-NEXT: s_add_i32 s8, s24, s8 -; GFX7-NEXT: 
s_add_i32 s12, s28, s12 -; GFX7-NEXT: s_add_i32 s16, s33, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_add_i32 s9, s25, s9 ; GFX7-NEXT: s_add_i32 s10, s26, s10 ; GFX7-NEXT: s_add_i32 s11, s27, s11 +; GFX7-NEXT: s_add_i32 s12, s28, s12 ; GFX7-NEXT: s_add_i32 s13, s29, s13 ; GFX7-NEXT: s_add_i32 s14, s30, s14 ; GFX7-NEXT: s_add_i32 s15, s31, s15 +; GFX7-NEXT: s_add_i32 s16, s33, s16 ; GFX7-NEXT: s_add_i32 s17, s34, s17 ; GFX7-NEXT: s_add_i32 s18, s35, s18 ; GFX7-NEXT: s_add_i32 s19, s36, s19 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-NEXT: v_mov_b32_e32 v10, s12 -; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v7, s9 ; GFX7-NEXT: v_mov_b32_e32 v8, s10 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_mov_b32_e32 v10, s12 ; GFX7-NEXT: v_mov_b32_e32 v11, s13 ; GFX7-NEXT: v_mov_b32_e32 v12, s14 ; GFX7-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-NEXT: v_mov_b32_e32 v14, s16 ; GFX7-NEXT: v_mov_b32_e32 v15, s17 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 ; GFX7-NEXT: v_mov_b32_e32 v17, s19 @@ -2007,45 +2000,45 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX11-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_readfirstlane_b32 s19, v5 ; GFX11-NEXT: v_readfirstlane_b32 s16, v2 ; GFX11-NEXT: v_readfirstlane_b32 s17, v3 ; GFX11-NEXT: v_readfirstlane_b32 s18, v4 +; GFX11-NEXT: v_readfirstlane_b32 s19, v5 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_readfirstlane_b32 s23, v9 ; GFX11-NEXT: v_readfirstlane_b32 s20, v6 ; GFX11-NEXT: v_readfirstlane_b32 s21, v7 ; GFX11-NEXT: v_readfirstlane_b32 s22, v8 +; GFX11-NEXT: v_readfirstlane_b32 s23, v9 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_readfirstlane_b32 s27, v13 ; GFX11-NEXT: v_readfirstlane_b32 s24, v10 ; GFX11-NEXT: 
v_readfirstlane_b32 s25, v11 ; GFX11-NEXT: v_readfirstlane_b32 s26, v12 +; GFX11-NEXT: v_readfirstlane_b32 s27, v13 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s31, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v14 ; GFX11-NEXT: v_readfirstlane_b32 s29, v15 ; GFX11-NEXT: v_readfirstlane_b32 s30, v16 +; GFX11-NEXT: v_readfirstlane_b32 s31, v17 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s19, s3 ; GFX11-NEXT: s_add_i32 s0, s16, s0 ; GFX11-NEXT: s_add_i32 s1, s17, s1 ; GFX11-NEXT: s_add_i32 s2, s18, s2 -; GFX11-NEXT: s_add_i32 s7, s23, s7 +; GFX11-NEXT: s_add_i32 s3, s19, s3 ; GFX11-NEXT: s_add_i32 s4, s20, s4 ; GFX11-NEXT: s_add_i32 s5, s21, s5 ; GFX11-NEXT: s_add_i32 s6, s22, s6 -; GFX11-NEXT: s_add_i32 s11, s27, s11 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_add_i32 s7, s23, s7 ; GFX11-NEXT: s_add_i32 s8, s24, s8 ; GFX11-NEXT: s_add_i32 s9, s25, s9 ; GFX11-NEXT: s_add_i32 s10, s26, s10 -; GFX11-NEXT: s_add_i32 s15, s31, s15 -; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX11-NEXT: s_add_i32 s11, s27, s11 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v9, s7 ; GFX11-NEXT: s_add_i32 s12, s28, s12 ; GFX11-NEXT: s_add_i32 s13, s29, s13 ; GFX11-NEXT: s_add_i32 s14, s30, s14 +; GFX11-NEXT: s_add_i32 s15, s31, s15 ; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX11-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 @@ -2069,45 +2062,45 @@ define amdgpu_ps void @load_uniform_P4_v16i32(ptr addrspace(4) inreg %ptra, ptr ; GFX12-NEXT: global_load_b128 v[14:17], v14, s[0:1] offset:48 ; GFX12-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x3 -; GFX12-NEXT: v_readfirstlane_b32 s19, v5 ; GFX12-NEXT: v_readfirstlane_b32 s16, v2 ; GFX12-NEXT: 
v_readfirstlane_b32 s17, v3 ; GFX12-NEXT: v_readfirstlane_b32 s18, v4 +; GFX12-NEXT: v_readfirstlane_b32 s19, v5 ; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_readfirstlane_b32 s23, v9 ; GFX12-NEXT: v_readfirstlane_b32 s20, v6 ; GFX12-NEXT: v_readfirstlane_b32 s21, v7 ; GFX12-NEXT: v_readfirstlane_b32 s22, v8 +; GFX12-NEXT: v_readfirstlane_b32 s23, v9 ; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_readfirstlane_b32 s27, v13 ; GFX12-NEXT: v_readfirstlane_b32 s24, v10 ; GFX12-NEXT: v_readfirstlane_b32 s25, v11 ; GFX12-NEXT: v_readfirstlane_b32 s26, v12 +; GFX12-NEXT: v_readfirstlane_b32 s27, v13 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_readfirstlane_b32 s31, v17 ; GFX12-NEXT: v_readfirstlane_b32 s28, v14 ; GFX12-NEXT: v_readfirstlane_b32 s29, v15 ; GFX12-NEXT: v_readfirstlane_b32 s30, v16 +; GFX12-NEXT: v_readfirstlane_b32 s31, v17 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_add_co_i32 s3, s19, s3 ; GFX12-NEXT: s_add_co_i32 s0, s16, s0 ; GFX12-NEXT: s_add_co_i32 s1, s17, s1 ; GFX12-NEXT: s_add_co_i32 s2, s18, s2 -; GFX12-NEXT: s_add_co_i32 s7, s23, s7 +; GFX12-NEXT: s_add_co_i32 s3, s19, s3 ; GFX12-NEXT: s_add_co_i32 s4, s20, s4 ; GFX12-NEXT: s_add_co_i32 s5, s21, s5 ; GFX12-NEXT: s_add_co_i32 s6, s22, s6 -; GFX12-NEXT: s_add_co_i32 s11, s27, s11 -; GFX12-NEXT: v_mov_b32_e32 v5, s3 +; GFX12-NEXT: s_add_co_i32 s7, s23, s7 ; GFX12-NEXT: s_add_co_i32 s8, s24, s8 ; GFX12-NEXT: s_add_co_i32 s9, s25, s9 ; GFX12-NEXT: s_add_co_i32 s10, s26, s10 -; GFX12-NEXT: s_add_co_i32 s15, s31, s15 -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: s_add_co_i32 s11, s27, s11 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v9, s7 ; GFX12-NEXT: s_add_co_i32 s12, s28, s12 ; GFX12-NEXT: s_add_co_i32 s13, s29, s13 ; GFX12-NEXT: s_add_co_i32 s14, s30, s14 +; GFX12-NEXT: s_add_co_i32 
s15, s31, s15 ; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v13, s11 ; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index b75eb737534e9..499118d03ba27 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -98,8 +98,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 @@ -136,8 +137,9 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 4 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297 @@ -313,12 +315,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_ashr_i32 s5, s4, 31 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], 
s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -327,12 +329,12 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -911,11 +913,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_ashr_i32 s5, s4, 31 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -925,11 +927,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_ashr_i32 s5, s4, 31 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1508,8 +1510,8 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) ; GFX12: ; %bb.0: ; GFX12-NEXT: s_add_co_u32 s0, s2, 0 ; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4 -; GFX12-NEXT: v_mov_b32_e32 v2, v0 -; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s0 +; GFX12-NEXT: 
v_mov_b32_e32 v4, s1 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1644,9 +1646,9 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre ; ; GFX12-LABEL: mubuf_cmpxchg_sgpr_ptr_vgpr_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: v_mov_b32_e32 v5, s3 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 1462b5965c0ab..f6645c3dc5dbd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -3247,8 +3247,8 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: s_ashr_i32 s3, s2, 31 -; GFX8-NEXT: s_mulk_i32 s2, 0x50 ; GFX8-NEXT: s_mulk_i32 s3, 0x50 +; GFX8-NEXT: s_mulk_i32 s2, 0x50 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_add_u32 s3, s3, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll index e0581f01dda6a..6526d4f76d8ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/or.ll @@ -596,13 +596,13 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0 ; 
GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_or_b32 s4, s3, 0x50 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -614,11 +614,11 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_or_b32 s2, s2, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -629,10 +629,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_or_b32 s2, s2, 0x50 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -643,10 +643,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_or_b32 s2, s2, 0x50 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -657,9 +657,10 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_or_b32 s2, 
s2, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -670,10 +671,12 @@ define amdgpu_kernel void @s_or_u64_zext_with_sregs(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_or_b32 s2, s2, 0x50 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index e450da73ab47d..6b29215a77006 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -570,8 +570,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_ashr_i32 s1, s0, 31 ; NEW_RBS-NEXT: s_lshl_b64 s[2:3], s[0:1], 2 ; NEW_RBS-NEXT: s_andn2_b32 s1, s5, exec_lo -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, exec_lo ; NEW_RBS-NEXT: s_or_b32 s5, s1, s5 ; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6 @@ -583,8 +583,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2 ; NEW_RBS-NEXT: ; %bb.4: ; %B ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_mov_b32 s6, exec_lo ; 
NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6 ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo @@ -595,8 +595,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1 ; NEW_RBS-NEXT: ; %bb.5: ; %loop.body ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 -; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3 ; NEW_RBS-NEXT: s_add_i32 s2, s0, 1 ; NEW_RBS-NEXT: s_cmpk_lt_u32 s0, 0x64 ; NEW_RBS-NEXT: s_cselect_b32 s0, exec_lo, 0 @@ -604,8 +604,8 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo ; NEW_RBS-NEXT: s_andn2_b32 s3, s6, exec_lo ; NEW_RBS-NEXT: s_and_b32 s0, exec_lo, s0 -; NEW_RBS-NEXT: s_or_b32 s6, s3, s0 ; NEW_RBS-NEXT: global_load_dword v8, v[6:7], off +; NEW_RBS-NEXT: s_or_b32 s6, s3, s0 ; NEW_RBS-NEXT: s_mov_b32 s0, s2 ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_add_nc_u32_e32 v8, 1, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 2f956d7a0a534..6369bb557c14b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4180,8 +4180,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: s_add_u32 s4, s0, s2 ; GFX6-NEXT: s_addc_u32 s3, s1, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4205,8 +4205,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_add_u32 s4, s0, s2 ; GFX8-NEXT: s_addc_u32 s3, s1, s3 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4230,8 +4230,8 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4558,8 +4558,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4579,8 +4579,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-LABEL: s_saddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4600,8 +4600,8 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-LABEL: s_saddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4921,23 +4921,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg 
%lhs, <2 x i64> inre ; GFX6-LABEL: s_saddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX6-NEXT: s_add_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -4959,23 +4959,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-LABEL: s_saddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX8-NEXT: s_add_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 +; 
GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -4997,23 +4997,23 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-LABEL: s_saddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX9-NEXT: s_add_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 @@ -5097,13 +5097,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s4, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s8, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3] @@ -5139,12 +5139,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, 
i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s4, s0, s4 ; GFX8-NEXT: s_addc_u32 s5, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s8, s2, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5186,12 +5186,12 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s4, s0, s4 ; GFX9-NEXT: s_addc_u32 s5, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s8, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -5887,13 +5887,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_saddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s8, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_addc_u32 s16, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3] @@ -5917,16 +5917,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: s_add_u32 s0, s4, s12 ; 
GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -5966,12 +5966,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s8, s0, s8 ; GFX8-NEXT: s_addc_u32 s9, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s16, s2, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5990,27 +5990,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_add_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: s_add_u32 s0, s4, s12 +; GFX8-NEXT: s_addc_u32 s1, s5, s13 ; GFX8-NEXT: 
v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -6056,12 +6056,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s8, s0, s8 ; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s16, s2, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -6080,27 +6080,27 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_add_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_add_u32 s0, s4, s12 +; GFX9-NEXT: s_addc_u32 s1, s5, s13 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 
v7, v3, v0, vcc -; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 9d6ffc9bbc0dc..afbc14c03b6b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -42,13 +42,13 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -642,6 +642,7 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX8-NEXT: s_add_i32 s0, s8, s12 ; GFX8-NEXT: s_xor_b32 s0, s0, s12 @@ -688,13 +689,12 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 
v1, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -878,6 +878,7 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX8-NEXT: s_sub_i32 s11, 0, s10 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -982,15 +983,15 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: s_xor_b32 s0, s3, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 +; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s3, v7 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] @@ -2234,13 +2235,13 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 
v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2417,12 +2418,12 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 @@ -2649,13 +2650,13 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -3061,15 +3062,15 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr 
addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3215,15 +3216,15 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s5, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 256d6d9a16fa9..ec77987a33527 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -257,8 +257,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: 
v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: flat_store_dword v[2:3], v1 @@ -272,8 +272,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off @@ -287,8 +287,8 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) { ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX10-NEXT: global_store_dword v[2:3], v1, off @@ -347,9 +347,9 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index c1b225562b77b..cc8c6f950ec8e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ 
-4185,8 +4185,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: s_sub_u32 s4, s0, s2 ; GFX6-NEXT: s_subb_u32 s3, s1, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4210,8 +4210,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: s_subb_u32 s3, s1, s3 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] @@ -4235,8 +4235,8 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4563,8 +4563,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4584,8 +4584,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: 
s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4605,8 +4605,8 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-LABEL: s_ssubsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4926,23 +4926,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-LABEL: s_ssubsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_subb_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -4964,23 +4964,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-LABEL: s_ssubsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_subb_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -5002,23 +5002,23 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-LABEL: s_ssubsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc +; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_subb_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 @@ -5102,13 +5102,13 @@ define 
amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] @@ -5146,12 +5146,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s8, s0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s10, s2, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -5195,12 +5195,12 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s8, s0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s10, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -5940,13 +5940,13 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-LABEL: s_ssubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s17, s1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] @@ -5972,16 +5972,16 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: s_sub_u32 s0, s4, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s1, s5, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -6023,12 +6023,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s16, s0, s8 ; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s18, s2, s10 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 @@ -6049,27 +6049,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_sub_u32 s0, s4, s12 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 +; GFX8-NEXT: s_subb_u32 s1, s5, s13 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -6117,12 +6117,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s16, s0, s8 ; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s18, s2, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 @@ -6143,27 +6143,27 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: 
v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_sub_u32 s0, s4, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 +; GFX9-NEXT: s_subb_u32 s1, s5, s13 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll index e2fb704599250..b65e92eb9ec26 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll @@ -363,9 +363,9 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_sub_u32 s4, s16, s18 ; GFX7-NEXT: s_subb_u32 s5, s17, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -380,8 +380,8 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_sub_u32 s4, s16, s18 ; GFX9-NEXT: s_subb_u32 s5, s17, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX9-NEXT: v_mov_b32_e32 v0, s6 @@ -394,8 +394,8 @@ define void @s_usubo_usube(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_sub_u32 s4, s16, s18 ; GFX8-NEXT: s_subb_u32 s5, s17, s19 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll index 017575b92143b..480ea7436c4a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -676,8 +676,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_ssubo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_sub_u32 s4, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_subb_u32 s5, s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -693,8 +693,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX8-LABEL: s_ssubo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -710,8 +710,8 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX9-LABEL: s_ssubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index c50b491bcb074..191b8dadea991 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s + define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) { ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: @@ -35,9 +36,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -238,16 +239,16 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc 
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -536,6 +537,7 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -576,7 +578,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -706,6 +707,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s18 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -784,7 +786,6 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s19, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_mov_b32_e32 
v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -1243,16 +1244,16 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -1813,9 +1814,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2128,9 +2129,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2440,11 +2441,11 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2561,11 +2562,11 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll index 2d3ce9469ee90..dce7410b0bd88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -345,10 +345,10 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: s_add_u32 s0, s0, 2 -; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, 
s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll index 6b749df71223f..a19ebfd56c66d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -20,15 +20,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 @@ -62,15 +62,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 
v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s7 ; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 ; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 @@ -102,10 +102,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x42004200 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -122,10 +122,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -142,10 +142,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; 
GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v13, s3 ; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] @@ -176,15 +176,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -218,15 +218,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6 +; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v11, s7 ; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4 ; GFX12-NEXT: v_dual_mov_b32 
v7, s3 :: v_dual_mov_b32 v6, s2 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 @@ -260,15 +260,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -302,15 +302,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -344,15 +344,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -386,15 +386,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 @@ -428,15 +428,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> % ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s4, s0 ; GFX12-NEXT: s_mov_b32 s5, s0 ; GFX12-NEXT: s_mov_b32 s6, s0 +; 
GFX12-NEXT: s_mov_b32 s7, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 +; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v13, s7 ; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 ; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2 ; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll index 5344ab8da1ade..47077e025a90b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll @@ -18,11 +18,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 ; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -52,11 +52,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 ; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -85,11 +85,10 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> ; GFX12-LABEL: 
test_wmma_f16_16x16x16_f16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -103,11 +102,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -121,11 +119,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> ; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: s_mov_b32 s1, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_mov_b32_e32 v6, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v7, s1 ; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], 
v[2:3], v[6:7] ; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off ; GFX12-NEXT: s_endpgm @@ -152,11 +149,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -186,11 +183,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -220,11 +217,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -254,11 +251,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -288,11 +285,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -322,11 +319,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_mov_b32 s0, 0x40400000 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -356,11 +353,11 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: s_movk_i32 s0, 0x80 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 -; GFX12-NEXT: v_mov_b32_e32 v7, s3 +; GFX12-NEXT: s_mov_b32 s3, s0 ; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_mov_b32_e32 v7, s3 ; GFX12-NEXT: v_mov_b32_e32 v5, s1 ; GFX12-NEXT: v_mov_b32_e32 v4, s0 ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll index e882769f97ac1..4947594e414e2 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll @@ -544,11 +544,11 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v5, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[4:5] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v2, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v3, a5 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 @@ -614,11 +614,11 @@ define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 { ; CHECK-NEXT: ; def a[0:1] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index b8962fa29e8f1..e7b959bb50550 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -337,7 +337,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ 
-354,6 +353,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -464,7 +464,6 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -481,6 +480,7 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -493,12 +493,12 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 @@ -2095,10 
+2095,17 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: flat_load_dword v1, v[2:3] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 @@ -2127,13 +2134,6 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: flat_load_dword v1, v[2:3] -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2243,10 +2243,13 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: flat_load_dword v1, v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: 
v_accvgpr_write_b32 a4, v4 ; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 @@ -2275,9 +2278,6 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: flat_load_dword v1, v[2:3] ; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2595,9 +2595,9 @@ define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi @@ -2841,9 +2841,9 @@ define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi @@ -3200,9 +3200,9 @@ define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 ; GFX950-NEXT: v_xor_b32_e32 v2, 
v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi @@ -4010,7 +4010,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -4027,6 +4026,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -4135,7 +4135,6 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -4152,6 +4151,7 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -4164,12 +4164,12 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; 
GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 @@ -4382,9 +4382,9 @@ define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi @@ -4568,9 +4568,9 @@ define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi @@ -4837,9 +4837,9 @@ define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi @@ -6384,8 +6384,8 @@ define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi @@ -6568,9 +6568,9 @@ define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi @@ -6754,9 +6754,9 @@ define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi @@ -6975,9 +6975,9 @@ define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v7 ; GFX950-NEXT: v_and_b32_e32 v5, v0, v6 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; 
GFX950-NEXT: v_not_b32_e32 v2, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi @@ -7197,9 +7197,9 @@ define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi @@ -7384,10 +7384,10 @@ define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi @@ -7575,10 +7575,10 @@ define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi @@ -7766,10 +7766,10 @@ define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: 
scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi @@ -7957,10 +7957,10 @@ define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi @@ -8351,9 +8351,9 @@ define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi @@ -10108,8 +10108,9 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 
v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen @@ -10167,8 +10168,8 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi @@ -10328,9 +10329,10 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10377,8 +10379,8 @@ define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi @@ -10516,9 +10518,10 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 
v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -10565,8 +10568,8 @@ define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi @@ -10721,10 +10724,11 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -10988,10 +10992,11 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -14505,12 +14510,12 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB195_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14552,12 +14557,12 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB195_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14576,8 +14581,8 @@ define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end @@ -14682,12 +14687,12 @@ define void 
@flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB197_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14729,12 +14734,12 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB197_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14754,9 +14759,9 @@ define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end @@ -14863,12 +14868,12 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 
s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB199_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14910,12 +14915,12 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB199_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -14934,9 +14939,9 @@ define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end @@ -15042,12 +15047,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB201_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15106,12 +15111,12 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB201_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15147,9 +15152,9 @@ define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 ; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_not_b32_e32 v3, v2 ; GFX950-NEXT: v_not_b32_e32 v2, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi @@ -15292,12 +15297,12 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB203_2 ; GFX90A-NEXT: ; %bb.1: ; 
%atomicrmw.global @@ -15339,12 +15344,12 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB203_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15363,9 +15368,9 @@ define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX950-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end @@ -15471,12 +15476,12 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB205_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15518,12 +15523,12 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB205_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15542,9 +15547,9 @@ define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end @@ -15650,12 +15655,12 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB207_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15698,12 +15703,12 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 
-; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB207_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15723,10 +15728,10 @@ define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end @@ -15835,12 +15840,12 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB209_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15883,12 +15888,12 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 
v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB209_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -15908,10 +15913,10 @@ define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end @@ -16020,12 +16025,12 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB211_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16068,12 +16073,12 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB211_2 ; GFX950-NEXT: ; %bb.1: ; 
%atomicrmw.global @@ -16093,10 +16098,10 @@ define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end @@ -16205,12 +16210,12 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB213_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16253,12 +16258,12 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB213_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16278,10 +16283,10 @@ define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; 
GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end @@ -16390,12 +16395,12 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB215_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16440,12 +16445,12 @@ define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB215_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16580,12 +16585,12 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: 
s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB217_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16632,12 +16637,12 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB217_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16660,9 +16665,9 @@ define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] ; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 ; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 ; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end @@ -16777,12 +16782,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND 
-; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB219_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -16843,12 +16848,12 @@ define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB219_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -17042,12 +17047,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB221_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -17106,12 +17111,12 @@ define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: 
v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB221_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18145,12 +18150,12 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB235_3 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private @@ -18205,12 +18210,12 @@ define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB235_3 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private @@ -18383,12 +18388,12 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] 
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB237_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18420,8 +18425,9 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen @@ -18440,12 +18446,12 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB237_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18476,8 +18482,8 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi @@ -18607,12 +18613,12 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; 
GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB239_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18633,9 +18639,10 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -18654,12 +18661,12 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB239_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18680,8 +18687,8 @@ define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end @@ -18789,12 +18796,12 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: s_cbranch_vccz .LBB241_2 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18815,9 +18822,10 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen @@ -18836,12 +18844,12 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: 
v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: s_cbranch_vccz .LBB241_2 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -18862,8 +18870,8 @@ define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end @@ -18971,12 +18979,12 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB243_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19013,10 +19021,11 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -19036,12 +19045,12 @@ define void 
@flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB243_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19230,12 +19239,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX90A-NEXT: s_addc_u32 s5, s17, 0 ; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_cbranch_vccz .LBB245_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global @@ -19272,10 +19281,11 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc @@ -19295,12 +19305,12 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) 
#0 { ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX950-NEXT: s_cbranch_vccz .LBB245_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index b6fe0c756a106..668244a279dee 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -337,7 +337,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -354,6 +353,7 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -464,7 +464,6 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, 
v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -481,6 +480,7 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -493,12 +493,12 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 @@ -1600,10 +1600,17 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: global_load_dword v1, v[2:3], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 
v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 @@ -1632,13 +1639,6 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: global_load_dword v1, v[2:3], off -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1748,10 +1748,13 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: ; def a34 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: global_load_dword v1, v[2:3], off +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 @@ -1780,9 +1783,6 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: global_load_dword v1, v[2:3], off ; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2991,7 +2991,6 @@ define 
void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -3008,6 +3007,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] @@ -3116,7 +3116,6 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse @@ -3133,6 +3132,7 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] @@ -3145,12 +3145,12 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: 
v_accvgpr_write_b32 a19, v31 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX950-NEXT: buffer_wbl2 sc1 @@ -10334,11 +10334,11 @@ define void @global_atomic_xchg_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10352,10 +10352,11 @@ define void @global_atomic_xchg_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10412,11 +10413,11 @@ define void @global_atomic_add_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_add_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: 
v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10430,10 +10431,11 @@ define void @global_atomic_add_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_add_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10490,11 +10492,11 @@ define void @global_atomic_sub_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_sub_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10508,10 +10510,11 @@ define void @global_atomic_sub_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_sub_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10568,11 +10571,11 @@ define void 
@global_atomic_and_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_and_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10586,10 +10589,11 @@ define void @global_atomic_and_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_and_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10788,11 +10792,11 @@ define void @global_atomic_or_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_or_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10806,10 +10810,11 @@ define void @global_atomic_or_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_or_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: 
; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10866,11 +10871,11 @@ define void @global_atomic_xor_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_xor_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10884,10 +10889,11 @@ define void @global_atomic_xor_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_xor_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -10944,11 +10950,11 @@ define void @global_atomic_max_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_max_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ 
-10962,10 +10968,11 @@ define void @global_atomic_max_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_max_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11022,11 +11029,11 @@ define void @global_atomic_min_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX90A-LABEL: global_atomic_min_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11040,10 +11047,11 @@ define void @global_atomic_min_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 ; GFX950-LABEL: global_atomic_min_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11100,11 +11108,11 @@ define void @global_atomic_umax_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_umax_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11118,10 +11126,11 @@ define void @global_atomic_umax_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_umax_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11178,11 +11187,11 @@ define void @global_atomic_umin_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_umin_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11196,10 +11205,11 @@ define void @global_atomic_umin_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_umin_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_umin_x2 
v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11256,11 +11266,11 @@ define void @global_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11274,10 +11284,11 @@ define void @global_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -11334,11 +11345,11 @@ define void @global_atomic_udec_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX90A-LABEL: global_atomic_udec_wrap_i64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11352,10 +11363,11 @@ define void @global_atomic_udec_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %p ; GFX950-LABEL: global_atomic_udec_wrap_i64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12449,11 +12461,11 @@ define void @global_atomic_fadd_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12467,10 +12479,11 @@ define void @global_atomic_fadd_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12657,11 +12670,11 @@ define void @global_atomic_fmax_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; 
GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12675,10 +12688,11 @@ define void @global_atomic_fmax_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 @@ -12735,11 +12749,11 @@ define void @global_atomic_fmin_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -12753,10 +12767,11 @@ define void @global_atomic_fmin_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a[0:1] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX950-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 
3194581fa4213..c09bdcf97e0ab 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -245,11 +245,11 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 -; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 ; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 -; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 +; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index b8814b64735e6..2701fce7f90b8 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -344,9 +344,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -541,20 +541,20 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 
v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -1021,8 +1021,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1033,8 +1034,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1123,8 +1125,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1137,8 +1140,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad ; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1166,10 +1170,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX6-NEXT: .LBB9_2: ; %if ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: .LBB9_3: ; %endif -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -1192,10 +1196,10 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: .LBB9_2: ; %if ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX8-NEXT: .LBB9_3: ; %endif -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -1218,9 +1222,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: .LBB9_2: ; %if ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX9-NEXT: .LBB9_3: ; %endif +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm @@ -1241,9 +1245,9 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr 
addrspace( ; GFX10-NEXT: .LBB9_2: ; %if ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX10-NEXT: .LBB9_3: ; %endif +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm @@ -1265,8 +1269,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX11-NEXT: .LBB9_3: ; %endif ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB9_4: @@ -1286,8 +1290,8 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 ; GFX12-NEXT: .LBB9_3: ; %endif ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; GFX12-NEXT: .LBB9_4: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ef7a13819a799..9a3a04a622086 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -568,19 +568,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 ; GFX908-NEXT: s_mov_b32 s13, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s13 +; GFX908-NEXT: v_mov_b32_e32 v7, s13 +; GFX908-NEXT: v_mov_b32_e32 v9, s13 +; 
GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] ; GFX908-NEXT: v_mov_b32_e32 v4, s12 +; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s12 ; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s9, v2 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir index 950382758ffbc..945a8faf8ebae 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir @@ -34,17 +34,17 @@ body: | ; GFX908-LABEL: name: no_free_vgprs_for_copy_a64_to_a64 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3 - ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
implicit $agpr2_agpr3 ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_a64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1 - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 $agpr2_agpr3 = COPY $agpr0_agpr1 S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir index 1573903945a3e..603179b7063f7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir @@ -15,10 +15,10 @@ body: | ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec - ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; 
GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec @@ -47,8 +47,8 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3, implicit $agpr0_agpr1 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $agpr0_agpr1, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0 $agpr0_agpr1 = IMPLICIT_DEF SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -66,13 +66,13 @@ body: | ; GFX908-LABEL: name: overlapping_agpr ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr1_agpr2_agpr3_agpr4 - ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908-NEXT: $agpr4 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1_agpr2_agpr3_agpr4 $agpr1_agpr2_agpr3_agpr4 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir index a9d31c1c45b0e..da8e368f5ac47 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir @@ -35,17 +35,17 @@ body: | ; GFX908-LABEL: name: no_free_vgprs_for_copy_s64_to_a64 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr8_sgpr9 - ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec, implicit-def $agpr2_agpr3 - ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9 + ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec + ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_s64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit-def $agpr2_agpr3, implicit $sgpr8_sgpr9 - ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9 + ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec + ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 $agpr2_agpr3 = COPY $sgpr8_sgpr9 S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll index 63b7b70548baf..3c9b08d26288d 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -124,8 +124,8 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -167,14 +167,14 @@ define amdgpu_kernel void @test_call_empty() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_empty@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_empty@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 
0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -259,8 +259,8 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -302,14 +302,14 @@ define amdgpu_kernel void @test_call_areg4() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_4@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_4@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -394,8 +394,8 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], 
s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -437,14 +437,14 @@ define amdgpu_kernel void @test_call_areg32() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_32@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_32@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -529,8 +529,8 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -572,14 +572,14 @@ define amdgpu_kernel void @test_call_areg64() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_64@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_64@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; 
GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -664,8 +664,8 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -707,14 +707,14 @@ define amdgpu_kernel void @test_call_areg31_63() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, func_areg_31_63@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_31_63@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART @@ -799,8 +799,8 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: v_mov_b32_e32 v31, v0 +; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: ;;#ASMSTART @@ -842,14 +842,14 @@ define amdgpu_kernel void @test_call_unknown() #0 { ; GFX908-NEXT: s_add_u32 s4, s4, 
func_unknown@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, func_unknown@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX908-NEXT: s_mov_b32 s32, 0 ; GFX908-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 689b306518c9b..1b7604c887b25 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 4c5c56a49fdc6..976522f7d5ea4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -161095,7 +161095,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: 
v_mov_b32_e32 v58, v31 @@ -161129,6 +161128,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v31, vcc ; GFX9-NEXT: v_bfe_u32 v31, v1, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v31, v31, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -161443,15 +161443,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_mov_b32_e32 v34, v62 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v52 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v50 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; GFX9-NEXT: v_mov_b32_e32 v35, v63 @@ -161462,8 +161460,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v63, v16 
+; GFX9-NEXT: v_mov_b32_e32 v34, v62 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -161471,6 +161470,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 @@ -185476,6 +185476,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, v55 ; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v10, v32 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 @@ -185554,10 +185555,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v10, v32 ; VI-NEXT: v_add_f16_e32 v43, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_or_b32_e32 v51, v3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_or_b32_e32 v50, v2, v0 @@ -185672,21 +185672,19 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -185734,6 +185732,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -185745,6 +185744,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] +; 
VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v45 ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] ; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v51 @@ -218732,8 +218732,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB100_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v47, v15 :: v_dual_mov_b32 v46, v14 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v48 :: v_dual_mov_b32 v17, v49 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v45, v13 :: v_dual_mov_b32 v44, v12 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v43, v11 :: v_dual_mov_b32 v42, v10 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8 @@ -218741,12 +218739,14 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v48 :: v_dual_mov_b32 v17, v49 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v50 :: v_dual_mov_b32 v19, v51 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v52 :: v_dual_mov_b32 v21, v53 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v54 :: v_dual_mov_b32 v23, v55 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v56 :: v_dual_mov_b32 v25, v57 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v58 :: v_dual_mov_b32 v27, v59 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48 @@ -230914,6 +230914,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v47, 
16, v3 +; SI-NEXT: v_mov_b32_e32 v39, v59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -231202,7 +231203,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 -; SI-NEXT: v_mov_b32_e32 v39, v59 ; SI-NEXT: v_mov_b32_e32 v40, v60 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 ; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index da908bc280e6e..05e3580e06a49 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -11639,7 +11639,6 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll index 2b48cf0f41c88..5537d5705d4d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -55,10 +55,10 @@ define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 
v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index df77e7de43bf6..c8d344ff4a579 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7905,8 +7905,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_sub_u32 s4, s4, s6 ; GFX6-NEXT: s_subb_u32 s5, s5, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -9119,8 +9119,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_sub_u32 s4, s4, s10 ; GFX6-NEXT: s_subb_u32 s5, s5, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 2889f37a65d97..e8fee9204c883 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -54,9 +54,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: 
s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -86,9 +86,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, s1 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, s2 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, s3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: s_mov_b32 s32, 0 ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -219,9 +219,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v29, v37 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v30, v38 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v31, v39 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GISEL-GFX10-NEXT: s_endpgm @@ -347,9 +347,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v29, v34 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v30, v33 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v31, v32 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s25, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s24, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: s_swappc_b64 s[30:31], s[24:25] ; DAGISEL-GFX10-NEXT: s_endpgm @@ -374,9 +374,9 @@ define amdgpu_cs_chain void @alloca_and_call() { ; GISEL-GFX10: ; %bb.0: ; %.entry ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 
42 -; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -400,9 +400,9 @@ define amdgpu_cs_chain void @alloca_and_call() { ; DAGISEL-GFX10: ; %bb.0: ; %.entry ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 42 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, use@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, use@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b64 s[0:1], s[48:49] ; DAGISEL-GFX10-NEXT: s_mov_b64 s[2:3], s[50:51] ; DAGISEL-GFX10-NEXT: buffer_store_dword v0, off, s[48:51], 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -449,10 +449,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -489,10 +489,10 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: 
s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -530,10 +530,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 ; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] ; @@ -564,10 +564,10 @@ define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) ; DAGISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 ; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) @@ -930,15 +930,15 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) { ; GISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 ; GISEL-GFX11-NEXT: s_mov_b32 s2, 3 -; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 -; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index 36e2db0c4879d..a4cab10e972bf 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -60,10 +60,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND -; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], 
s[100:101] ; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -100,10 +100,10 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND -; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 -; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 ; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -583,15 +583,15 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 4, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, 4 ; GISEL-GFX11-NEXT: s_mov_b32 s2, 3 -; GISEL-GFX11-NEXT: s_mov_b32 s1, 2 ; GISEL-GFX11-NEXT: s_mov_b32 s0, 1 -; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v8 -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GISEL-GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v3, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GISEL-GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/and.ll 
b/llvm/test/CodeGen/AMDGPU/and.ll index fe9ec8e6ef52a..25124c88d490c 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -651,8 +651,8 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX6-NEXT: s_mov_b32 s0, 0x80000 ; GFX6-NEXT: s_movk_i32 s1, 0x80 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -674,8 +674,8 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX8-NEXT: s_mov_b32 s0, 0x80000 ; GFX8-NEXT: s_movk_i32 s1, 0x80 ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -738,10 +738,10 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX6-NEXT: s_and_b32 s8, s7, 62 ; GFX6-NEXT: s_add_u32 s6, s6, s4 ; GFX6-NEXT: s_addc_u32 s7, 0, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_add_u32 s4, s8, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_addc_u32 s5, 0, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -765,10 +765,10 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: s_and_b32 s8, s7, 62 ; GFX8-NEXT: s_add_u32 s6, s6, s4 ; GFX8-NEXT: s_addc_u32 s7, 0, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s4, s8, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_addc_u32 s5, 0, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index 61645200690f5..f2aed101f045b 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -65,82 +65,82 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace( ; GFX8-NEXT: s_add_u32 s4, s10, 13 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 15 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 14 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 8 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 11 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 10 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 4 -; GFX8-NEXT: 
flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 6 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 1 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_add_u32 s4, s10, 3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: s_add_u32 s4, s10, 3 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s10, 9 -; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s4, s10, 2 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NEXT: s_add_u32 s4, s10, 2 +; GFX8-NEXT: s_add_u32 s0, s10, 5 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: s_addc_u32 s5, s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_add_u32 s0, s10, 5 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s10, 12 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: 
s_add_u32 s0, s10, 12 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s10, 7 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_add_u32 s0, s10, 7 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_addc_u32 s1, s11, 0 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 12cb8d2f6fb51..f04aa036fffbf 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -26,6 +26,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s + declare i32 @llvm.amdgcn.workitem.id.x() ; Show what the atomic optimization pass will do for global pointers. 
@@ -2369,12 +2370,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX8_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX8_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX8_ITERATIVE-NEXT: s_waitcnt vmcnt(0) @@ -2421,12 +2422,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s10, -1 ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_ITERATIVE-NEXT: s_mov_b32 s8, s2 ; GFX9_ITERATIVE-NEXT: s_mov_b32 s9, s3 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s7 ; GFX9_ITERATIVE-NEXT: buffer_atomic_add_x2 v[3:4], off, s[8:11], 0 glc ; GFX9_ITERATIVE-NEXT: s_waitcnt vmcnt(0) @@ -2841,12 +2842,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX8_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s10, -1 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: s_mov_b32 s8, s2 ; GFX8_DPP-NEXT: s_mov_b32 s9, s3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX8_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc ; GFX8_DPP-NEXT: 
s_waitcnt vmcnt(0) @@ -2924,12 +2925,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: s_mov_b32 s11, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s10, -1 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: s_mov_b32 s8, s2 ; GFX9_DPP-NEXT: s_mov_b32 s9, s3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s7 ; GFX9_DPP-NEXT: buffer_atomic_add_x2 v[6:7], off, s[8:11], 0 glc ; GFX9_DPP-NEXT: s_waitcnt vmcnt(0) @@ -3717,8 +3718,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -3765,8 +3766,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_subrev_nc_u32_e32 v3, s10, v4 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -3816,9 +3817,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: v_mov_b32_e32 
v1, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -3870,7 +3871,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132-NEXT: v_subrev_nc_u32_e32 v3, s10, v4 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -4148,8 +4149,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_subrev_nc_u32_e32 v3, s13, v4 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -4198,8 +4199,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_subrev_nc_u32_e32 v3, s11, v4 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -4251,9 +4252,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164-NEXT: v_subrev_nc_u32_e32 v3, s13, v4 -; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -4307,7 +4308,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132-NEXT: v_subrev_nc_u32_e32 v3, s11, v4 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -4630,8 +4631,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv @@ -4688,8 +4689,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s8, v4 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv @@ -4752,9 +4753,9 @@ define 
amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1164_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s12, v4 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v4 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164_ITERATIVE-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv @@ -4818,7 +4819,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v4, v0 ; GFX1132_ITERATIVE-NEXT: v_subrev_nc_u32_e32 v3, s8, v4 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v0, v3 ; GFX1132_ITERATIVE-NEXT: buffer_atomic_cmpswap_b32 v[0:1], off, s[4:7], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv @@ -5373,8 +5374,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX1132_DPP-NEXT: v_subrev_nc_u32_e32 v5, s9, v6 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v5 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v6 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b32 v[4:5], off, s[4:7], 0 glc ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl1_inv @@ -5617,8 +5617,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB9_2: ; %atomicrmw.start 
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s12, v7 ; GFX8-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, v5 @@ -5674,8 +5674,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s12, v7 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v5 @@ -5731,14 +5731,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_mov_b32 s5, s3 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_sub_co_u32 v5, vcc, v7, s12 ; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v6, vcc, 0, v8, vcc -; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v2, v7 -; GFX1064-NEXT: v_mov_b32_e32 v1, v6 +; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v3, v8 +; GFX1064-NEXT: v_mov_b32_e32 v1, v6 ; GFX1064-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -5787,14 +5787,14 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_mov_b32 s5, s3 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s10 ; GFX1032-NEXT: v_subrev_co_ci_u32_e32 
v6, vcc_lo, 0, v8, vcc_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v2, v7 -; GFX1032-NEXT: v_mov_b32_e32 v1, v6 +; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-NEXT: v_mov_b32_e32 v1, v6 ; GFX1032-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -5845,17 +5845,17 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_mov_b32 s5, s3 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v7, v0 +; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_sub_co_u32 v5, vcc, v7, s12 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc -; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v2, v7 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v1, v6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-NEXT: v_mov_b32_e32 v1, v6 ; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -5910,13 +5910,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 
+; GFX1132-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v2, v7 ; GFX1132-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s10 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc_lo -; GFX1132-NEXT: v_mov_b32_e32 v0, v5 -; GFX1132-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_mov_b32_e32 v1, v6 +; GFX1132-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX1132-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -6124,8 +6122,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v9, v4 ; GFX8-NEXT: v_subb_u32_e32 v8, vcc, v10, v5, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, v7 @@ -6175,10 +6173,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mul_i32 s7, s9, s6 ; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6 ; GFX9-NEXT: s_add_i32 s7, s12, s7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_mul_i32 s14, s8, s6 ; GFX9-NEXT: s_mov_b64 s[12:13], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 @@ -6187,8 +6185,8 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s14, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v6 @@ -6249,14 +6247,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_mov_b32 s5, s3 ; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064-NEXT: v_mov_b32_e32 v8, v1 ; GFX1064-NEXT: v_sub_co_u32 v5, vcc, v7, s14 ; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v6, vcc, s15, v8, vcc -; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v2, v7 -; GFX1064-NEXT: v_mov_b32_e32 v1, v6 +; GFX1064-NEXT: v_mov_b32_e32 v0, v5 ; GFX1064-NEXT: v_mov_b32_e32 v3, v8 +; GFX1064-NEXT: v_mov_b32_e32 v1, v6 ; GFX1064-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl1_inv @@ -6310,14 +6308,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_mov_b32 s5, s3 ; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032-NEXT: v_mov_b32_e32 v8, v1 ; GFX1032-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s12 ; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s13, v8, vcc_lo -; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v2, v7 -; GFX1032-NEXT: v_mov_b32_e32 v1, v6 +; GFX1032-NEXT: v_mov_b32_e32 v0, v5 ; GFX1032-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-NEXT: v_mov_b32_e32 v1, v6 ; GFX1032-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl1_inv @@ -6373,17 +6371,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_mov_b32 s5, s3 ; GFX1164-NEXT: .LBB10_2: ; 
%atomicrmw.start ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_mov_b32_e32 v7, v0 +; GFX1164-NEXT: v_mov_b32_e32 v8, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_sub_co_u32 v5, vcc, v7, s14 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_subrev_co_ci_u32_e64 v6, null, s15, v8, vcc -; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v2, v7 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v1, v6 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164-NEXT: v_mov_b32_e32 v0, v5 ; GFX1164-NEXT: v_mov_b32_e32 v3, v8 +; GFX1164-NEXT: v_mov_b32_e32 v1, v6 ; GFX1164-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl1_inv @@ -6444,13 +6442,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 +; GFX1132-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v2, v7 ; GFX1132-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s12 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_subrev_co_ci_u32_e64 v6, null, s13, v8, vcc_lo -; GFX1132-NEXT: v_mov_b32_e32 v0, v5 -; GFX1132-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX1132-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 ; GFX1132-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl1_inv @@ -6687,8 +6683,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX8_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v9, v0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX8_ITERATIVE-NEXT: v_subrev_u32_e32 v7, vcc, s8, v9 ; GFX8_ITERATIVE-NEXT: v_subb_u32_e32 v8, vcc, v10, v6, vcc ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, v7 @@ -6759,8 +6755,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX9_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v9, v0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v10, v1 ; GFX9_ITERATIVE-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v9 ; GFX9_ITERATIVE-NEXT: v_subb_co_u32_e32 v8, vcc, v10, v6, vcc ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, v7 @@ -6829,14 +6825,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1064_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1064_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc, v8, s8 ; GFX1064_ITERATIVE-NEXT: v_subrev_co_ci_u32_e32 v7, vcc, s9, v9, vcc -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; 
GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1064_ITERATIVE-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1064_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1064_ITERATIVE-NEXT: buffer_gl1_inv @@ -6898,14 +6894,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1032_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 ; GFX1032_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 ; GFX1032_ITERATIVE-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v9, vcc_lo -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1032_ITERATIVE-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX1032_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1032_ITERATIVE-NEXT: buffer_gl1_inv @@ -6973,17 +6969,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s5, s3 ; GFX1164_ITERATIVE-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v8, v0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v9, v1 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
; GFX1164_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc, v8, s8 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_ITERATIVE-NEXT: v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, v8 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, v9 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 ; GFX1164_ITERATIVE-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1164_ITERATIVE-NEXT: s_waitcnt vmcnt(0) ; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv @@ -7053,13 +7049,11 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v2, v8 ; GFX1132_ITERATIVE-NEXT: v_sub_co_u32 v6, vcc_lo, v8, s8 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_ITERATIVE-NEXT: v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc_lo -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, v6 -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v1, v7 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; GFX1132_ITERATIVE-NEXT: buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc ; GFX1132_ITERATIVE-NEXT: s_waitcnt vmcnt(0) 
; GFX1132_ITERATIVE-NEXT: buffer_gl1_inv @@ -7301,8 +7295,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s11 ; GFX8_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX8_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX8_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX8_DPP-NEXT: v_subrev_u32_e32 v8, vcc, s10, v10 ; GFX8_DPP-NEXT: v_subb_u32_e32 v9, vcc, v11, v0, vcc ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7404,8 +7398,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s11 ; GFX9_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX9_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v10, v6 +; GFX9_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX9_DPP-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v10 ; GFX9_DPP-NEXT: v_subb_co_u32_e32 v9, vcc, v11, v0, vcc ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v8 @@ -7526,13 +7520,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064_DPP-NEXT: s_mov_b32 s5, s3 ; GFX1064_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1064_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064_DPP-NEXT: v_sub_co_u32 v10, vcc, v12, s8 ; GFX1064_DPP-NEXT: v_subrev_co_ci_u32_e32 v11, vcc, s9, v13, vcc ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v12 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v13 ; GFX1064_DPP-NEXT: buffer_atomic_cmpswap_x2 v[8:11], off, s[4:7], 0 glc ; GFX1064_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7629,13 +7623,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: 
s_mov_b32 s5, s3 ; GFX1032_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1032_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v14, v10 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v14, v10 ; GFX1032_DPP-NEXT: v_sub_co_u32 v11, vcc_lo, v13, s8 ; GFX1032_DPP-NEXT: v_subrev_co_ci_u32_e32 v12, vcc_lo, s9, v14, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v13 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v12 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v14 ; GFX1032_DPP-NEXT: buffer_atomic_cmpswap_x2 v[9:12], off, s[4:7], 0 glc ; GFX1032_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7756,17 +7750,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: s_mov_b32 s5, s3 ; GFX1164_DPP-NEXT: .LBB11_2: ; %atomicrmw.start ; GFX1164_DPP-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7 ; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, v10, s8 ; GFX1164_DPP-NEXT: s_waitcnt_depctr depctr_va_vcc(0) -; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v10 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v11 ; GFX1164_DPP-NEXT: buffer_atomic_cmpswap_b64 
v[6:9], off, s[4:7], 0 glc ; GFX1164_DPP-NEXT: s_waitcnt vmcnt(0) @@ -7875,7 +7869,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_subrev_co_ci_u32_e64 v11, null, s9, v13, vcc_lo ; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v10 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v9, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v13 ; GFX1132_DPP-NEXT: buffer_atomic_cmpswap_b64 v[8:11], off, s[4:7], 0 glc ; GFX1132_DPP-NEXT: s_waitcnt vmcnt(0) @@ -9042,9 +9036,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -9099,9 +9093,9 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -9157,13 +9151,13 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; 
GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9222,13 +9216,13 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9287,11 +9281,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9349,11 +9343,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1132-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9412,12 +9406,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -9478,12 +9472,12 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -9546,11 +9540,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -9612,11 +9606,11 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: .LBB13_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -10943,9 +10937,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1064-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -11000,9 +10994,9 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -11058,13 +11052,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -11122,13 +11116,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-FAKE16-NEXT: s_mov_b64 
s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -11187,11 +11181,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11248,11 +11242,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1132-FAKE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11311,12 +11305,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11376,12 +11370,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; 
GFX1264-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_add_nc_u32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, s12, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s13, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -11444,11 +11438,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -11509,11 +11503,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr 
addrspa ; GFX1232-FAKE16-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_add_nc_u32_e32 v0, s12, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v0, s12, v1 ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s11, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12065,10 +12059,10 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1064-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -12106,10 +12100,10 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1032-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX1032-NEXT: v_and_or_b32 v0, 
v1, s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -12147,15 +12141,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12194,15 +12188,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1164-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12241,15 +12235,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12287,15 +12282,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start 
; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1132-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12333,15 +12329,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -12380,15 +12376,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s10, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -12427,15 +12423,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, s2, v1 ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12473,15 +12470,16 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1232-FAKE16-NEXT: v_add_f16_e32 v0, s8, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v1 @@ -12674,9 +12672,9 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1064-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX1064-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1064-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX1064-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v1 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -12721,9 +12719,9 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1032-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX1032-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX1032-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -12774,12 +12772,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12831,12 +12829,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1164-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -12889,12 +12887,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1132-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1132-TRUE16-NEXT: 
v_lshlrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -12943,13 +12942,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1132-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1132-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1132-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13000,12 +12999,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s8, v2 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -13056,12 +13055,12 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1264-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s8, v0 -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_or_b32 v0, v1, s9, v0 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -13113,12 +13112,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; 
GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13167,13 +13167,13 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX1232-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_or_b32 v0, v1, s8, v0 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -13720,8 +13720,8 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr 
addrspace(1) %result, ptr add ; GFX1064-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1064-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1064-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) @@ -13769,8 +13769,8 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1032-NEXT: v_cmp_u_f32_e64 s0, v0, v0 ; GFX1032-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1032-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) @@ -13885,9 +13885,9 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -14001,9 +14001,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) ; GFX1132-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 @@ -14115,9 +14116,9 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -14228,9 +14229,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1232-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1232-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 -; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 785aee07a990e..a013c7e7ab7b7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2019,8 +2019,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] @@ -2067,8 +2067,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2113,8 +2113,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; 
GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2159,8 +2159,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_add_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2211,8 +2211,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB6_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_add_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2877,8 +2877,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] @@ -2909,8 +2909,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; 
GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX9_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2940,8 +2940,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1064_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -2971,8 +2971,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -3007,8 +3007,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, s1 ; GFX1164_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -3041,8 +3041,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1132_ITERATIVE-NEXT: s_cbranch_execz .LBB7_4 ; GFX1132_ITERATIVE-NEXT: ; 
%bb.3: -; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX1132_ITERATIVE-NEXT: v_mov_b32_e32 v0, s0 ; GFX1132_ITERATIVE-NEXT: ds_add_u64 v2, v[0:1] ; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_ITERATIVE-NEXT: buffer_gl0_inv @@ -5433,8 +5433,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] @@ -5481,8 +5481,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5527,8 +5527,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5573,8 +5573,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; 
GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_sub_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -5625,8 +5625,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_sub_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6907,8 +6907,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] @@ -6953,8 +6953,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -6997,8 +6997,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { 
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7042,8 +7042,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_and_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -7092,8 +7092,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_and_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8255,8 +8255,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] @@ -8301,8 
+8301,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8345,8 +8345,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8390,8 +8390,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_or_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -8440,8 +8440,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB18_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_or_rtn_b64 
v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9603,8 +9603,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] @@ -9649,8 +9649,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9693,8 +9693,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9738,8 +9738,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: 
v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_xor_rtn_b64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -9788,8 +9788,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB20_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_xor_rtn_b64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11202,8 +11202,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] @@ -11257,8 +11257,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11308,8 +11308,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11358,8 +11358,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -11415,8 +11415,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13027,8 +13027,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] @@ -13082,8 +13082,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; 
GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13133,8 +13133,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13183,8 +13183,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_i64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -13240,8 +13240,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB26_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_i64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14846,8 +14846,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], 
exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] @@ -14900,8 +14900,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14950,8 +14950,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -14999,8 +14999,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_max_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -15055,8 +15055,8 @@ define amdgpu_kernel void 
@umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB29_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_max_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16657,8 +16657,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX8_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX8_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX8_ITERATIVE-NEXT: ; %bb.3: -; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, -1 ; GFX8_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] @@ -16711,8 +16711,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX9_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX9_ITERATIVE-NEXT: ; %bb.3: -; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX9_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16761,8 +16761,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1064_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1064_ITERATIVE-NEXT: ; %bb.3: -; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1064_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1064_ITERATIVE-NEXT: 
s_waitcnt lgkmcnt(0) @@ -16810,8 +16810,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_ITERATIVE-NEXT: s_xor_b32 s2, exec_lo, s2 ; GFX1032_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1032_ITERATIVE-NEXT: ; %bb.3: -; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v4, s1 ; GFX1032_ITERATIVE-NEXT: v_mov_b32_e32 v3, s0 ; GFX1032_ITERATIVE-NEXT: ds_min_rtn_u64 v[3:4], v0, v[3:4] ; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) @@ -16866,8 +16866,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_cbranch_execz .LBB32_4 ; GFX1164_ITERATIVE-NEXT: ; %bb.3: -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, s0 ; GFX1164_ITERATIVE-NEXT: ds_min_rtn_u64 v[2:3], v4, v[2:3] ; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll index ce8ffab77ac85..8b1dc1e4bc193 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll @@ -116,8 +116,8 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr ; GFX9-SDAG-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-SDAG-NEXT: s_addc_u32 s9, s1, 0 ; GFX9-SDAG-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -172,8 +172,8 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, s6, 
v1 ; GFX9-GISEL-NEXT: s_addc_u32 s9, s1, 0 ; GFX9-GISEL-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 4af2d58b01518..043b6ffbc4018 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -20,8 +20,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -44,8 +44,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -75,8 +75,8 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 4bc6220b4d9a0..3ad770a95ad2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -108,14 +108,14 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_7 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 ; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] ; CHECK-NEXT: s_branch .LBB0_6 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index e33b9ab0eda9e..c4216f5cb4d84 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -54,8 +54,8 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm @@ -150,11 +150,11 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; 
GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_load_ushort v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm ; @@ -233,8 +233,8 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) # ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm @@ -316,10 +316,10 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dword v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm ; @@ -394,8 +394,8 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm @@ -558,8 +558,8 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 
+; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm @@ -725,10 +725,10 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> ; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GISEL-NEXT: s_endpgm @@ -821,9 +821,9 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index ad0d6d8016ad6..782a8507a9472 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -29,11 +29,11 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_mov_b32 s18, 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_mov_b32 s13, s12 ; CHECK-NEXT: s_mov_b32 s14, s12 ; CHECK-NEXT: s_mov_b32 s15, s12 -; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] ; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] +; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i ; CHECK-NEXT: 
v_cmp_lt_f32_e64 s12, s53, 0 @@ -80,8 +80,8 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 -; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v1, s49 ; CHECK-NEXT: v_mov_b32_e32 v2, s50 ; CHECK-NEXT: v_mov_b32_e32 v3, s51 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index efb75e95212b2..9d828ad997f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -66,8 +67,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -569,8 +570,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; 
GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -625,8 +626,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -680,8 +681,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX7-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v6, v7 ; GFX7-NEXT: v_mov_b32_e32 v7, v8 ; GFX7-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -737,8 +738,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX6-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_add_f32_e32 v7, v8, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v6, v7 ; GFX6-NEXT: v_mov_b32_e32 v7, v8 ; GFX6-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -824,8 +825,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1208,7 +1209,7 @@ define float 
@buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1236,8 +1237,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1426,7 +1427,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1454,8 +1455,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1644,7 +1645,7 @@ 
define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1672,8 +1673,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1843,10 +1844,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1883,10 +1884,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: 
v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1914,11 +1915,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1955,10 +1956,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -1985,10 +1986,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: 
v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -2015,9 +2016,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2047,9 +2048,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2336,11 +2337,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: 
v_readfirstlane_b32 s4, v9 @@ -2434,11 +2435,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -2499,12 +2500,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX10-NEXT: ; Child Loop BB10_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; GFX10-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v9 @@ -2595,10 +2596,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_mov_b32_e32 v0, v11 -; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 ; GFX908-NEXT: v_mov_b32_e32 v3, v14 +; GFX908-NEXT: v_mov_b32_e32 v0, v11 +; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 
; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -2657,10 +2658,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mov_b32_e32 v0, v11 -; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 ; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: v_mov_b32_e32 v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -2836,10 +2837,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2876,10 +2877,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; 
GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2907,11 +2908,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -2966,10 +2967,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -2996,10 +2997,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3026,9 +3027,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3058,9 +3059,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3097,10 +3098,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3137,10 +3138,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -3168,11 +3169,11 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; GFX10-NEXT: v_mov_b32_e32 v1, v7 -; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -3209,10 +3210,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v6 -; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: v_mov_b32_e32 v2, v8 ; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 @@ -3239,10 +3240,10 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: v_mov_b32_e32 v2, v8 ; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; 
GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -3269,9 +3270,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3301,9 +3302,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3354,13 +3355,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: 
v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -3401,7 +3403,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3409,7 +3412,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3479,12 +3482,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -3519,14 +3523,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3558,11 +3563,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; 
GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3700,11 +3705,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3741,11 +3746,11 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3992,11 +3997,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v1, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 
0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4130,11 +4135,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4169,11 +4174,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4236,12 +4241,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, v7 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4317,14 +4323,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4463,12 +4470,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v7 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4538,14 +4546,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4609,12 +4618,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; 
GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4747,8 +4756,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_add_f16_e32 v6, v6, v5 ; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v9, v7 ; GFX908-NEXT: v_mov_b32_e32 v8, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4814,8 +4823,8 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX8-NEXT: v_and_b32_e32 v8, v7, v11 ; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v9, v7 ; GFX8-NEXT: v_mov_b32_e32 v8, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5033,14 +5042,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, 
s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -5092,14 +5101,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -5189,13 +5198,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -5241,13 +5250,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -5287,9 +5296,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5612,14 +5621,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 
depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -5758,13 +5767,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -5803,9 +5812,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6077,14 +6086,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6177,8 +6186,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 
v5, v6 ; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6335,14 +6343,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6430,8 +6438,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6654,8 +6661,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; 
GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6728,8 +6735,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6797,8 +6804,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6867,8 +6874,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6948,7 +6955,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6976,8 +6983,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7080,9 +7087,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7127,9 +7134,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7468,8 +7475,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; 
GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-NEXT: v_dual_mov_b32 v6, v7 :: v_dual_mov_b32 v7, v8 ; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7613,8 +7619,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v7, v8, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: v_mov_b32_e32 v7, v8 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7671,8 +7677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_add_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v7, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, v8 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7743,9 +7749,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7902,7 +7908,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -7930,8 +7936,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -8050,9 +8056,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8097,9 +8103,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8414,7 +8420,7 @@ define 
<2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -8442,8 +8448,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -8562,9 +8568,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8609,9 +8615,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8983,10 
+8989,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -9037,7 +9044,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -9082,8 +9089,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -9452,9 +9459,10 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -9497,8 +9505,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9892,8 +9900,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -9976,8 +9983,7 
@@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10203,8 +10209,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10278,8 +10284,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10343,13 +10349,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10422,8 +10428,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -10553,10 +10559,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -10607,7 +10614,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -10652,8 +10659,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -11022,9 +11029,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11067,8 +11075,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, 
v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11394,10 +11402,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -11448,7 +11457,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11493,8 +11502,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: 
v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -11863,9 +11872,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -11908,8 +11918,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12277,9 +12287,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -12322,8 +12333,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -12612,8 +12623,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index f3bf8c664f7a6..bbb7779a36ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; 
-------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -593,8 +594,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 ; GFX908-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -651,8 +652,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; GFX8-NEXT: v_max_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -798,9 +799,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_max_f32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -829,9 +830,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: 
v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1186,10 +1187,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1229,10 +1228,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1367,10 +1364,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 
v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1408,10 +1405,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1459,9 +1456,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1489,9 +1486,9 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1567,12 +1564,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[5:6] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 @@ -1667,12 +1664,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX11-NEXT: 
v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -1794,11 +1791,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -1858,11 +1855,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -1970,10 +1967,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2013,10 +2008,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2236,10 +2229,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2279,10 +2270,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2432,7 +2421,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2440,7 +2430,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -2482,7 +2472,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2491,7 +2482,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -2564,14 +2555,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen 
glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -2608,7 +2600,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2616,7 +2609,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -2649,12 +2642,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) @@ -2798,11 +2791,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2839,11 +2832,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2941,7 +2934,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2950,7 +2944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3064,7 +3058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3072,7 +3067,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3104,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: 
v_max_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3249,11 +3244,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3288,11 +3283,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3356,14 +3351,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3448,8 +3443,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3591,14 +3585,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3677,8 +3671,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3886,8 +3879,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_max_f16_e32 v4, v4, v10 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3955,8 +3948,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: 
Depth=2 @@ -4174,14 +4167,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -4233,14 +4226,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: 
buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4330,13 +4323,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -4382,13 +4375,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 
; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4428,9 +4421,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4755,14 +4748,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4901,13 +4894,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; 
GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4946,9 +4939,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5222,14 +5215,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5322,8 +5315,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5480,14 +5472,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, 
v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5575,8 +5567,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5799,8 +5790,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5873,8 +5864,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5943,8 +5934,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6014,8 +6005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6135,11 +6126,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6168,9 +6160,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap 
v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -6297,9 +6289,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6344,9 +6336,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6472,9 +6464,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6708,8 +6700,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: 
v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6834,8 +6825,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7014,8 +7004,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 ; GFX908-NEXT: v_pk_max_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7076,8 +7066,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_max_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v6, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7148,9 +7138,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7308,11 +7298,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -7366,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -7457,10 +7448,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -7511,7 +7503,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -7556,8 +7548,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7875,9 +7867,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -8011,9 +8004,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -8056,8 +8050,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8346,8 +8340,7 @@ define <2 x 
bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8436,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8596,8 +8588,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8680,8 +8671,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8907,8 +8897,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -8982,8 +8972,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9047,13 +9037,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 
v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9121,13 +9111,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v9 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index d1dc76f321375..d3e43f6945b29 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -593,8 +594,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 ; GFX908-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; 
GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -651,8 +652,8 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 ; GFX8-NEXT: v_min_f32_e32 v6, v5, v8 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -798,9 +799,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_min_f32 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -829,9 +830,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -1186,10 +1187,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1229,10 +1228,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -1367,10 +1364,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1408,10 
+1405,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1459,9 +1456,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v10, v3 ; GFX908-NEXT: v_mov_b32_e32 v9, v2 +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1489,9 +1486,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v10, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, v2 +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc @@ -1567,12 +1564,12 @@ define double 
@buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] +; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[5:6] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 @@ -1667,12 +1664,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] +; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 -; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -1794,11 +1791,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v2, v13 +; GFX908-NEXT: v_mov_b32_e32 v3, 
v14 ; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 -; GFX908-NEXT: v_mov_b32_e32 v2, v13 -; GFX908-NEXT: v_mov_b32_e32 v3, v14 ; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -1858,11 +1855,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-NEXT: v_mov_b32_e32 v2, v13 -; GFX8-NEXT: v_mov_b32_e32 v3, v14 ; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -1970,10 +1967,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2013,10 +2008,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], 
v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2236,10 +2229,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] -; GFX12-NEXT: v_mov_b32_e32 v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -2279,10 +2270,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -2432,7 +2421,8 @@ define half 
@buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2440,7 +2430,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -2482,7 +2472,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2491,7 +2482,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) 
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -2564,14 +2555,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -2608,7 +2600,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2616,7 +2609,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -2649,12 +2642,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2798,11 +2791,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 
offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2839,11 +2832,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2941,7 +2934,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0 ; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2950,7 +2944,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -3064,7 +3058,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3072,7 +3067,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -3104,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3249,11 +3244,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: 
v_and_b32_e32 v4, s7, v1 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3288,11 +3283,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3356,14 +3351,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 
Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3448,8 +3443,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3591,14 +3585,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3677,8 +3671,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3886,8 +3879,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3955,8 +3948,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4174,14 +4167,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -4233,14 +4226,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4330,13 +4323,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: 
v_dual_cndmask_b32 v1, v4, v6 :: v_dual_mov_b32 v4, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -4382,13 +4375,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v3 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4428,9 +4421,9 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4755,14 +4748,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -4901,13 +4894,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v5 :: v_dual_mov_b32 v5, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -4946,9 +4939,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5222,14 +5215,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 
@@ -5322,8 +5315,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5480,14 +5472,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v9, v12 :: v_dual_mov_b32 v9, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5575,8 +5567,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 ; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5799,8 +5790,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v4, v5 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5873,8 +5864,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -5943,8 +5934,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v4, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6014,8 +6005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 
; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 @@ -6135,11 +6126,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -6168,9 +6160,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -6297,9 +6289,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: 
buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6344,9 +6336,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6472,9 +6464,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6708,8 +6700,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_min_num_f16 v6, v5, v8 -; GFX12-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0 @@ -6834,8 +6825,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX11-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-NEXT: v_mov_b32_e32 v6, v7 
+; GFX11-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0 @@ -7014,8 +7004,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 ; GFX908-NEXT: v_pk_min_f16 v6, v5, v8 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7076,8 +7066,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: v_min_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v6, v6, v9 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7148,9 +7138,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -7308,11 +7298,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v1, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV @@ -7366,7 +7357,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -7457,10 +7448,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], 
v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv @@ -7511,7 +7503,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v0, v5 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -7556,8 +7548,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv @@ -7875,9 +7867,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV @@ -8011,9 +8004,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv @@ -8056,8 +8050,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 -; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8346,8 +8340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: 
v_readfirstlane_b32 s4, v0 @@ -8436,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8596,8 +8588,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8680,8 +8671,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v6 :: v_dual_mov_b32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8907,8 +8897,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v6, v6 ; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 -; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v5, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 @@ -8982,8 +8972,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v5, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9047,13 +9037,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v10 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 ; GFX7-NEXT: s_mov_b64 s[12:13], exec +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 @@ -9121,13 +9111,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 ; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: 
v_alignbit_b32 v5, v5, v6, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 ; GFX6-NEXT: s_mov_b64 s[12:13], exec +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll index 25bad218926f3..8612b95b9b44b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll @@ -35,12 +35,13 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -136,12 +137,13 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_rem ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: 
v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -187,12 +189,13 @@ define i32 @buffer_fat_ptr_agent_atomic_usub_cond_ret_u32__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -243,12 +246,13 @@ define i32 @buffer_fat_ptr_system_atomic_usub_cond_ret_u32__offset__amdgpu_no_fi ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, v5, v2 ; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: 
v_mov_b32_e32 v0, v4 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll index 96b71cf85c8b2..bfd0c405fe1a6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll @@ -11,10 +11,10 @@ define amdgpu_kernel void @buffer_ptr_vector_ops(ptr addrspace(1) %somewhere) { ; GISEL-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GISEL-NEXT: v_mov_b32_e32 v7, s7 @@ -68,9 +68,9 @@ define amdgpu_kernel void @buffer_structs(%fat_buffer_struct %arg, ptr addrspace ; GISEL-NEXT: s_ashr_i32 s7, s6, 31 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 ; GISEL-NEXT: s_add_u32 s4, s8, s4 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v4, s6 ; GISEL-NEXT: s_addc_u32 s5, s9, s5 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 37f4094806637..62599aa9d7d08 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -274,9 +274,9 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index c407f7645315d..3b6a4e974e9cc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -71,11 +71,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: s_mov_b32 s32, 0 @@ -90,11 +90,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: s_mov_b32 s32, 0 @@ -109,11 +109,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 
s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -156,11 +156,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -182,11 +182,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -205,11 +205,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: 
v_bfe_i32 v0, v0, 0, 1 @@ -228,11 +228,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -287,10 +287,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: v_bfe_i32 v0, v0, 0, 1 @@ -315,11 +315,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 @@ -338,11 +338,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: 
s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: v_and_b32_e32 v0, 1, v0 @@ -361,11 +361,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 @@ -420,10 +420,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 @@ -443,11 +443,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; 
VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s32, 0 @@ -462,11 +462,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: s_mov_b32 s32, 0 @@ -481,11 +481,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -539,11 +539,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, 
external_void_func_i8@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -566,11 +566,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -588,11 +588,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -610,11 +610,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] 
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -666,10 +666,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -692,11 +692,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -714,11 +714,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 
s[30:31], s[4:5] @@ -736,11 +736,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -792,10 +792,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -814,11 +814,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: s_mov_b32 s32, 0 @@ -833,11 +833,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 
0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: s_mov_b32 s32, 0 @@ -852,11 +852,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -910,11 +910,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -936,11 +936,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], 
s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -958,11 +958,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -980,11 +980,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1036,10 +1036,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, 
external_void_func_i16_signext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1062,11 +1062,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1084,11 +1084,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1106,11 +1106,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_i16_zeroext@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1162,10 +1162,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1184,11 +1184,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 @@ -1203,11 +1203,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; 
CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 @@ -1222,11 +1222,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 42 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -1269,11 +1269,11 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1291,11 +1291,11 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: v_mov_b32_e32 v1, 0 @@ -1311,11 +1311,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: v_mov_b32_e32 v1, 0 @@ -1331,11 +1331,11 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; SDAG-NEXT: v_mov_b32_e32 v1, 0 @@ -1380,12 +1380,12 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b ; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1408,10 +1408,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dwordx4 
v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1430,10 +1430,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1452,10 +1452,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1532,11 +1532,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 
s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -1554,11 +1554,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -1576,11 +1576,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -1630,7 +1630,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 @@ -1638,6 +1637,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; 
GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -1660,10 +1660,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -1684,10 +1684,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 @@ -1708,10 +1708,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v4, 1 ; SDAG-NEXT: v_mov_b32_e32 v5, 2 @@ -1802,10 +1802,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 @@ -1828,10 +1828,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 @@ -1854,10 +1854,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v4, 1 ; SDAG-NEXT: v_mov_b32_e32 v5, 2 @@ -1927,9 +1927,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GISEL-NEXT: v_mov_b32_e32 v3, s3 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: v_mov_b32_e32 v6, 3 ; GISEL-NEXT: v_mov_b32_e32 v7, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], 
s[4:5] @@ -1949,11 +1949,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; VI-NEXT: s_mov_b32 s32, 0 @@ -1968,11 +1968,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 @@ -1987,11 +1987,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x4400 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -2045,11 +2045,11 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; 
GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x4400 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2067,11 +2067,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 4.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -2086,11 +2086,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 @@ -2105,11 +2105,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: 
s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 4.0 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -2152,11 +2152,11 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2174,11 +2174,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2194,11 +2194,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2214,11 +2214,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2263,12 +2263,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2286,11 +2286,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; 
VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2307,11 +2307,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2328,11 +2328,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2380,13 +2380,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 
s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2404,11 +2404,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2427,11 +2427,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2450,11 +2450,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2507,7 +2507,6 @@ 
define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 @@ -2516,6 +2515,7 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 ; GISEL-NEXT: v_mov_b32_e32 v3, -1.0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0.5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2533,11 +2533,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -2553,11 +2553,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ 
-2573,11 +2573,11 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x40100000 @@ -2622,12 +2622,12 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2645,11 +2645,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2667,11 +2667,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() 
#0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2689,11 +2689,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2743,7 +2743,6 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 @@ -2751,6 +2750,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2768,11 +2768,11 @@ define amdgpu_kernel 
void @test_call_external_void_func_v3f64_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2792,11 +2792,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2816,11 +2816,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2875,7 +2875,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; 
GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 @@ -2885,6 +2884,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2905,11 +2905,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2927,10 +2927,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2949,11 +2949,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, 
s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2994,16 +2994,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -3028,10 +3028,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3049,10 +3049,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 ; 
CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3074,10 +3074,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3124,13 +3124,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3153,10 +3153,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: 
s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3174,10 +3174,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3200,10 +3200,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3250,13 +3250,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; 
GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3275,11 +3275,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 3 @@ -3295,11 +3295,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -3316,11 +3316,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 ; SDAG-NEXT: v_mov_b32_e32 v1, 3 @@ -3365,12 +3365,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 ; GISEL-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3388,11 +3388,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; VI-NEXT: v_mov_b32_e32 v1, 0x4400 @@ -3408,11 +3408,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: 
s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -3429,11 +3429,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x4400 @@ -3479,12 +3479,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x4400 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3506,10 +3506,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 
s[30:31], s[4:5] @@ -3527,10 +3527,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3553,10 +3553,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3603,13 +3603,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3628,11 +3628,11 @@ 
define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 0x40003 @@ -3648,11 +3648,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -3670,11 +3670,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001 ; SDAG-NEXT: v_mov_b32_e32 v1, 0x40003 @@ -3720,12 +3720,12 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, 
s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001 ; GISEL-NEXT: v_mov_b32_e32 v1, 0x40003 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3746,11 +3746,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3767,11 +3767,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3792,11 +3792,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 
s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3837,16 +3837,16 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0 +; GISEL-NEXT: s_mov_b32 s38, -1 ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -3871,10 +3871,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3892,10 +3892,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; 
CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3913,10 +3913,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3963,13 +3963,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; GISEL-NEXT: s_mov_b32 s39, 0xe00000 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] -; GISEL-NEXT: s_getpc_b64 s[4:5] -; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -3988,11 +3988,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: 
s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4008,11 +4008,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4028,11 +4028,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4077,12 +4077,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, 
external_void_func_v2i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 1 ; GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4100,11 +4100,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 @@ -4121,11 +4121,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 v1, 4 @@ -4142,11 +4142,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -4194,13 +4194,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 3 ; GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GISEL-NEXT: v_mov_b32_e32 v2, 5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4218,11 +4218,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 @@ -4240,11 +4240,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; CI-NEXT: 
s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 v1, 4 @@ -4262,11 +4262,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 3 ; SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -4316,7 +4316,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s5 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 @@ -4324,6 +4323,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 4 ; GISEL-NEXT: v_mov_b32_e32 v2, 5 ; GISEL-NEXT: v_mov_b32_e32 v3, 6 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4345,10 +4345,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4366,10 +4366,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4387,10 +4387,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4464,11 +4464,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4486,11 +4486,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4508,11 +4508,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4562,7 +4562,6 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 @@ -4570,6 +4569,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v1, 2 ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4587,11 
+4587,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4610,11 +4610,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4633,11 +4633,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4690,7 +4690,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; 
GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 @@ -4699,6 +4698,7 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v2, 3 ; GISEL-NEXT: v_mov_b32_e32 v3, 4 ; GISEL-NEXT: v_mov_b32_e32 v4, 5 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4723,10 +4723,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4747,10 +4747,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4771,10 +4771,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: 
buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4830,10 +4830,10 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -4862,11 +4862,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4888,11 +4888,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, 
external_void_func_v8i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4914,11 +4914,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: v_mov_b32_e32 v0, 1 ; SDAG-NEXT: v_mov_b32_e32 v1, 2 @@ -4978,7 +4978,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -4990,6 +4989,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GISEL-NEXT: v_mov_b32_e32 v5, 6 ; GISEL-NEXT: v_mov_b32_e32 v6, 7 ; GISEL-NEXT: v_mov_b32_e32 v7, 8 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5016,10 +5016,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; 
VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5042,10 +5042,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5068,10 +5068,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5131,10 +5131,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GISEL-NEXT: 
s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -5183,12 +5183,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[8:9] ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5215,12 +5215,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[8:9] ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5247,12 +5247,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_getpc_b64 s[8:9] ; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: 
s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(6) ; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 @@ -5325,11 +5325,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; GISEL-NEXT: s_mov_b32 s55, 0xe00000 ; GISEL-NEXT: s_add_u32 s52, s52, s3 -; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s23 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 @@ -5354,6 +5353,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 @@ -5396,12 +5396,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5431,12 +5431,12 @@ define amdgpu_kernel void 
@test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5466,12 +5466,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(8) ; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 @@ -5558,11 +5558,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: s_addc_u32 s53, s53, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 -; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; GISEL-NEXT: v_mov_b32_e32 v0, s23 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] +; GISEL-NEXT: ; 
kill: killed $sgpr0_sgpr1 +; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1 ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -5587,6 +5586,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 @@ -5615,14 +5615,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; VI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s50, -1 ; VI-NEXT: s_mov_b32 s51, 0xe80000 -; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; VI-NEXT: s_add_u32 s48, s48, s5 ; VI-NEXT: s_addc_u32 s49, s49, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[48:49] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[48:49] ; VI-NEXT: s_mov_b64 s[2:3], s[50:51] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 @@ -5639,14 +5639,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s50, -1 ; CI-NEXT: s_mov_b32 s51, 0xe8f000 -; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; CI-NEXT: s_add_u32 s48, s48, s5 ; CI-NEXT: s_addc_u32 s49, s49, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; CI-NEXT: 
s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 @@ -5663,14 +5663,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; SDAG-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; SDAG-NEXT: s_mov_b32 s50, -1 ; SDAG-NEXT: s_mov_b32 s51, 0xe00000 -; SDAG-NEXT: s_add_u32 s48, s48, s5 ; SDAG-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; SDAG-NEXT: s_add_u32 s48, s48, s5 ; SDAG-NEXT: s_addc_u32 s49, s49, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49] ; SDAG-NEXT: s_mov_b64 s[2:3], s[50:51] ; SDAG-NEXT: v_mov_b32_e32 v0, 42 ; SDAG-NEXT: s_mov_b32 s32, 0 @@ -5724,15 +5724,15 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; GISEL-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GISEL-NEXT: s_mov_b32 s50, -1 ; GISEL-NEXT: s_mov_b32 s51, 0xe00000 -; GISEL-NEXT: s_add_u32 s48, s48, s5 ; GISEL-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GISEL-NEXT: s_add_u32 s48, s48, s5 ; GISEL-NEXT: s_addc_u32 s49, s49, 0 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; GISEL-NEXT: v_mov_b32_e32 v0, 42 +; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49] ; GISEL-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL-NEXT: s_mov_b32 s32, 0 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5762,10 +5762,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 
offset:4 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5786,10 +5786,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5810,10 +5810,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -5869,10 +5869,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GISEL-NEXT: s_add_u32 s36, s36, s3 ; GISEL-NEXT: s_addc_u32 s37, s37, 0 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[8:9] ; GISEL-NEXT: s_add_u32 s8, s8, 
external_void_func_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -5901,12 +5901,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_movk_i32 s32, 0x400 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -5929,12 +5929,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_movk_i32 s32, 0x400 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -5958,12 +5958,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 
; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_movk_i32 s32, 0x400 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(1) ; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6047,12 +6047,12 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 -; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_movk_i32 s32, 0x400 +; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_waitcnt vmcnt(1) ; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 @@ -6086,10 +6086,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; VI-NEXT: s_movk_i32 s32, 0x800 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword 
v0, off, s[36:39], s32 offset:4 @@ -6124,10 +6124,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; CI-NEXT: s_movk_i32 s32, 0x800 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6163,10 +6163,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 ; SDAG-NEXT: s_movk_i32 s32, 0x800 ; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(1) ; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 @@ -6300,10 +6300,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; GISEL-NEXT: s_movk_i32 s32, 0x800 ; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1] -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: 
s_waitcnt vmcnt(1) ; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32 @@ -6353,10 +6353,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6395,10 +6395,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6437,10 +6437,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) @@ -6562,26 +6562,26 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GISEL-NEXT: s_lshr_b32 s17, s3, 8 ; GISEL-NEXT: s_lshr_b32 s18, s3, 16 ; GISEL-NEXT: s_lshr_b32 s19, s3, 24 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: 
v_mov_b32_e32 v4, s1 -; GISEL-NEXT: v_mov_b32_e32 v8, s2 -; GISEL-NEXT: v_mov_b32_e32 v12, s3 -; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s8 ; GISEL-NEXT: v_mov_b32_e32 v2, s9 ; GISEL-NEXT: v_mov_b32_e32 v3, s10 +; GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GISEL-NEXT: v_mov_b32_e32 v5, s11 ; GISEL-NEXT: v_mov_b32_e32 v6, s12 ; GISEL-NEXT: v_mov_b32_e32 v7, s13 +; GISEL-NEXT: v_mov_b32_e32 v8, s2 ; GISEL-NEXT: v_mov_b32_e32 v9, s14 ; GISEL-NEXT: v_mov_b32_e32 v10, s15 ; GISEL-NEXT: v_mov_b32_e32 v11, s16 +; GISEL-NEXT: v_mov_b32_e32 v12, s3 ; GISEL-NEXT: v_mov_b32_e32 v13, s17 ; GISEL-NEXT: v_mov_b32_e32 v14, s18 ; GISEL-NEXT: v_mov_b32_e32 v15, s19 +; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: s_endpgm @@ -6610,12 +6610,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: s_mov_b64 s[2:3], s[54:55] ; VI-NEXT: v_mov_b32_e32 v0, s36 ; VI-NEXT: v_mov_b32_e32 v1, s37 @@ -6669,12 +6669,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; CI-NEXT: v_mov_b32_e32 v0, s5 -; CI-NEXT: s_mov_b64 
s[6:7], s[0:1] -; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: s_mov_b64 s[2:3], s[54:55] ; CI-NEXT: v_mov_b32_e32 v0, s36 ; CI-NEXT: v_mov_b32_e32 v1, s37 @@ -6728,12 +6728,12 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; SDAG-NEXT: v_mov_b32_e32 v0, s4 ; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; SDAG-NEXT: v_mov_b32_e32 v0, s5 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] ; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; SDAG-NEXT: s_getpc_b64 s[4:5] ; SDAG-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; SDAG-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53] ; SDAG-NEXT: s_mov_b64 s[2:3], s[54:55] ; SDAG-NEXT: v_mov_b32_e32 v0, s36 ; SDAG-NEXT: v_mov_b32_e32 v1, s37 @@ -6883,7 +6883,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 ; GISEL-NEXT: v_mov_b32_e32 v0, s1 -; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 ; GISEL-NEXT: s_getpc_b64 s[4:5] ; GISEL-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 @@ -6908,6 +6907,7 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53] ; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 diff 
--git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ef5438e63f667..644a903138de1 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -245,14 +245,14 @@ define amdgpu_kernel void @call_coldcc() #0 { ; VI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 ; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 @@ -357,14 +357,14 @@ define amdgpu_kernel void @call_fastcc() #0 { ; VI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 ; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] ; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b64 s[10:11], s[6:7] +; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[4:5], s[0:1] ; VI-NEXT: s_mov_b64 s[6:7], s[2:3] ; VI-NEXT: s_mov_b64 s[0:1], s[88:89] -; VI-NEXT: v_or_b32_e32 v31, v0, v2 ; VI-NEXT: s_mov_b64 s[2:3], s[90:91] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index da5e73199a223..98b6fa018a70a 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -54,13 +54,13 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias 
%lb, ptr noali ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_add_u32 s0, s0, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -72,11 +72,11 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v11, v[6:7] ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: s_add_u32 s2, s2, 24 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 @@ -179,13 +179,13 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s0, 8 ; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 @@ -199,18 +199,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_add_u32 s4, s2, 16 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s5, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; GFX10-NEXT: flat_store_dword v[0:1], v8 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index a99aab7a23a3b..bf9bae774fd50 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM ; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM + ; Tests that we can avoid nullptr checks for addrspacecasts from/to priv/local. 
; ; Whenever a testcase is successful, we should see the addrspacecast replaced with the intrinsic @@ -228,8 +229,8 @@ define void @private_alloca_to_flat(ptr %ptr) { ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base ; GISEL-ASM-NEXT: s_lshr_b32 s4, s32, 6 -; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 +; GISEL-ASM-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index ce541dd2954f4..f8771cee537ca 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1027,10 +1027,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-NEXT: s_cbranch_execz .LBB5_1 ; GCN-NEXT: ; %bb.6: ; %bb8 ; GCN-NEXT: ; in Loop: Header=BB5_3 Depth=1 -; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: s_branch .LBB5_1 ; GCN-NEXT: .LBB5_7: ; %bb12 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -1177,10 +1174,13 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 { ; GCN-O0-NEXT: s_mov_b32 s5, s10 ; GCN-O0-NEXT: s_mov_b32 s6, s9 ; GCN-O0-NEXT: s_mov_b32 s7, s8 -; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: s_waitcnt expcnt(4) ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: s_waitcnt expcnt(3) ; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: s_waitcnt expcnt(2) ; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: s_waitcnt expcnt(1) ; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 ; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir 
b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir index 46a72c032827c..4e87905f464f1 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir @@ -15,10 +15,10 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy ; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11 ; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 @@ -37,8 +37,8 @@ body: | ; CHECK-LABEL: name: nonoverlapping_copy_kill ; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr0_sgpr1_sgpr2 - ; CHECK-NEXT: $sgpr2 = S_MOV_B32 $sgpr6, implicit killed $sgpr4_sgpr5_sgpr6 + ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr2 = S_MOV_B32 killed $sgpr6 ; CHECK-NEXT: 
renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 renamable $sgpr0_sgpr1_sgpr2 = COPY killed renamable $sgpr4_sgpr5_sgpr6 diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir index 5efeb8d40afbb..b18fa12ae782f 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir @@ -15,9 +15,9 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy ; CHECK: liveins: $sgpr30_sgpr31, $vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr1_vgpr2_vgpr3 @@ -36,9 +36,9 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy_1 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, 
implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr2_vgpr3_vgpr4 @@ -57,9 +57,9 @@ body: | ; CHECK-LABEL: name: nonoverlapping_copy_kill ; CHECK: liveins: $sgpr30_sgpr31, $vgpr3_vgpr4_vgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit killed $vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr3, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec + ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr5, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr3_vgpr4_vgpr5 @@ -78,10 +78,10 @@ body: | ; CHECK-LABEL: name: overlapping_copy_kill_half_s128 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4_vgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK-NEXT: 
$vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = V_OR_B32_e32 1, $vgpr1, implicit $exec ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index f6cd3d131a627..078dff49f01cf 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -661,8 +661,8 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] ; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 07e6a76d14cf9..237a0f9dbaccc 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -106,10 +106,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -537,8 +537,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: 
v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -563,11 +563,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -701,12 +701,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -718,7 +719,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_cselect_b32 s2, s3, 32 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm @@ -816,16 +816,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 
-; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -951,33 +951,33 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 @@ -1009,8 +1009,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; 
VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1119,11 +1119,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1201,8 +1201,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1228,11 +1228,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[0:1] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) @@ -1505,10 +1505,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: 
flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1584,10 +1584,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1661,10 +1661,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2220,12 +2220,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; VI-NEXT: s_lshl_b32 s2, s2, 14 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_bfe_u32 s2, s2, 0x20010 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 37f5889918c41..77ae8b021417d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -351,9 +351,9 @@ define amdgpu_kernel void 
@ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; SI-NEXT: s_mov_b32 s5, 0 ; SI-NEXT: .LBB7_3: ; %endif -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -377,10 +377,10 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] ; VI-NEXT: s_mov_b32 s5, 0 ; VI-NEXT: .LBB7_3: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index d17cdeb8917ff..b95eff1b9feca 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -572,8 +572,8 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 137acd34ecc2a..62c0d3820ab53 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -93,10 +93,10 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -514,8 +514,8 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -540,11 +540,11 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -672,8 +672,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -781,16 +781,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, 
s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -916,33 +916,33 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 @@ -974,8 +974,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; 
VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1094,16 +1094,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1216,16 +1216,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1341,16 +1341,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; 
VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1456,10 +1456,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1541,8 +1541,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll index 39af6a05d2725..ad75d1a814955 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll @@ -5,8 +5,8 @@ define amdgpu_kernel void @private_load_maybe_divergent(ptr addrspace(4) %k, ptr ; GCN-LABEL: private_load_maybe_divergent: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] ; GCN-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_add_u32 s20, s20, s17 ; GCN-NEXT: s_addc_u32 s21, s21, 0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index dd9a013d37203..b1eecfedbd442 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -185,9 +185,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 @@ -1267,16 +1267,16 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 -; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] ; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] -; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 @@ -1309,14 +1309,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc 
; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 ; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 @@ -1375,8 +1375,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0x7f ; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr3_vgpr4 killed $exec -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec ; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v1 @@ -1660,11 +1661,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 @@ 
-1768,8 +1771,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(19) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v5 @@ -1813,11 +1817,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v15, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v27 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v28 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v29 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v30 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -2197,11 +2203,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 @@ -2382,9 +2390,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_mov_b32_e32 v16, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 @@ -2431,8 +2439,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -3306,17 +3314,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] @@ 
-3331,23 +3339,23 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc -; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3384,8 +3392,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 ; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 @@ -3686,11 +3694,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Reload -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 @@ -3794,8 +3804,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(19) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(18) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v5 @@ -3839,11 +3850,13 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v13, v2, v3 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(11) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v28 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(10) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v29 -; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(9) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v30 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v31 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 9f1b55ea3b1ef..4e54ad43ff2a9 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1333,28 +1333,28 @@ define amdgpu_kernel 
void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_mov_b32 s14, s10 ; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s12, s8 +; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s48, s48, s11 ; CI-NEXT: s_mov_b64 s[10:11], s[6:7] ; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; CI-NEXT: s_load_dword s6, s[4:5], 0x2 ; CI-NEXT: s_addc_u32 s49, s49, 0 +; CI-NEXT: s_mov_b32 s12, s8 ; CI-NEXT: s_add_u32 s8, s4, 12 -; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 ; CI-NEXT: ds_read_b32 v41, v40 -; CI-NEXT: s_addc_u32 s9, s5, 0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 -; CI-NEXT: s_mov_b64 s[4:5], s[0:1] -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] -; CI-NEXT: s_mov_b64 s[0:1], s[48:49] +; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: s_addc_u32 s9, s5, 0 ; CI-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; CI-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; CI-NEXT: v_or_b32_e32 v31, v0, v2 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_mov_b64 s[0:1], s[48:49] ; CI-NEXT: s_mov_b64 s[2:3], s[50:51] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 @@ -1373,8 +1373,8 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 ; GFX9-NEXT: s_mov_b32 s14, s10 ; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] @@ -1382,17 +1382,17 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GFX9-NEXT: s_addc_u32 s37, 
s37, 0 ; GFX9-NEXT: s_add_u32 s8, s4, 12 -; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 ; GFX9-NEXT: ds_read_b32 v42, v41 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-NEXT: s_mov_b32 s17, void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s16, void_func_void@abs32@lo ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 1684437eff580..313d138b7f64f 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1107,8 +1107,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; CI-LABEL: store_misaligned64_constant_large_offsets: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b64 s[0:1], 0x7b -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384 @@ -1118,8 +1118,8 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { ; GFX9-LABEL: store_misaligned64_constant_large_offsets: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..e2bfb61783711 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -1024,10 +1024,10 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 ; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 5c91ee3f7e748..ffd4f91c0d265 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -4,9 +4,9 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5) { ; CHECK-LABEL: cannot_create_empty_or_backwards_segment: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] ; CHECK-NEXT: s_add_u32 s24, s24, s17 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index c69b0cce3d208..2af8f0ba81584 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -66,9 +66,9 @@ 
define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_cmp_eq_u32 s2, 3 ; GCN-NEXT: s_cselect_b32 s2, 0x40100a3d, s3 ; GCN-NEXT: s_cselect_b32 s3, 0x70a3d70a, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -103,9 +103,9 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_cselect_b32 s2, 0x70a3d70a, s8 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -144,8 +144,8 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -166,9 +166,9 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_cmp_eq_u32 s2, 1 ; GCN-NEXT: s_cselect_b32 s2, s3, 0x3f847ae1 ; GCN-NEXT: s_cselect_b32 s3, s4, 0x47ae147b -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -285,15 +285,15 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s18, s[4:5], 0x2c -; GCN-NEXT: s_mov_b32 
s0, 0 ; GCN-NEXT: s_mov_b32 s15, 0x40200000 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 ; GCN-NEXT: s_mov_b32 s11, 0x40180000 ; GCN-NEXT: s_mov_b32 s9, 0x40140000 ; GCN-NEXT: s_mov_b32 s7, 0x40100000 ; GCN-NEXT: s_mov_b32 s5, 0x40080000 ; GCN-NEXT: s_mov_b32 s3, 2.0 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s2, s0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s6, s0 @@ -336,6 +336,7 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x2c +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: s_mov_b32 s13, 0x401c0000 ; GCN-NEXT: s_mov_b32 s11, 0x40180000 @@ -343,17 +344,15 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s7, 0x40100000 ; GCN-NEXT: s_mov_b32 s5, 0x40080000 ; GCN-NEXT: s_mov_b32 s3, 2.0 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s2, s0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s6, s0 ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s10, s0 ; GCN-NEXT: s_mov_b32 s12, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s16, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -367,7 +366,6 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: v_mov_b32_e32 v12, s12 ; GCN-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NEXT: v_mov_b32_e32 v14, s14 ; GCN-NEXT: v_movrels_b32_e32 v16, v1 ; GCN-NEXT: v_movrels_b32_e32 v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s14 @@ -419,6 +417,7 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, 
s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 @@ -434,7 +433,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s43, 0x40100000 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 ; GCN-NEXT: s_mov_b32 s39, 2.0 -; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s38, s36 ; GCN-NEXT: s_mov_b32 s40, s36 ; GCN-NEXT: s_mov_b32 s42, s36 @@ -451,7 +449,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s64, s36 ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v31, s67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 m0, s2, 1 ; GCN-NEXT: v_mov_b32_e32 v2, s38 @@ -482,7 +479,6 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: v_mov_b32_e32 v27, s63 ; GCN-NEXT: v_mov_b32_e32 v28, s64 ; GCN-NEXT: v_mov_b32_e32 v29, s65 -; GCN-NEXT: v_mov_b32_e32 v30, s66 ; GCN-NEXT: v_movrels_b32_e32 v32, v1 ; GCN-NEXT: v_movrels_b32_e32 v31, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -500,8 +496,9 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s67, 0x40300000 +; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 +; GCN-NEXT: s_mov_b32 s36, 0 ; GCN-NEXT: s_mov_b32 s65, 0x402e0000 ; GCN-NEXT: s_mov_b32 s63, 0x402c0000 ; GCN-NEXT: s_mov_b32 s61, 0x402a0000 @@ -516,7 +513,6 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_mov_b32 s43, 0x40100000 ; GCN-NEXT: s_mov_b32 s41, 0x40080000 ; GCN-NEXT: s_mov_b32 s39, 2.0 -; GCN-NEXT: s_mov_b32 s37, 0x3ff00000 ; GCN-NEXT: s_mov_b32 s38, s36 ; GCN-NEXT: s_mov_b32 s40, s36 ; 
GCN-NEXT: s_mov_b32 s42, s36 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index fac9f5bf826a6..16300185a4b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -69,8 +69,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -123,8 +123,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -177,8 +177,8 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -274,8 +274,8 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -371,8 +371,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -427,8 +427,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_lshr_b32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -468,8 +468,8 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_lshr_b32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index c46fcde739b1c..ba63da85e75e7 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -220,9 +220,9 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -246,9 +246,9 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_or_b32 s3, s4, s3 ; VI-NEXT: s_or_b32 s2, s5, s2 -; VI-NEXT: v_mov_b32_e32 v3, 
s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 27cf49aec8229..614612de3ee5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -220,9 +220,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -236,9 +236,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 97e23fcdb2263..233936988017f 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -116,9 +116,9 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s3, 31 ; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll 
b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 52bcaed7ec75a..5259d20664d3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -79,13 +79,13 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm @@ -228,23 +228,23 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; 
GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d32b528d13276..932987b321042 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -613,9 +613,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -660,9 +660,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -707,9 +707,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; 
GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -1464,9 +1464,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1512,9 +1512,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1558,9 +1558,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1602,9 +1602,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1646,9 +1646,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1690,9 +1690,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-NEXT: v_mov_b32_e32 v0, 0 ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1738,9 +1738,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1785,9 +1785,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 
v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1831,9 +1831,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1878,9 +1878,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1922,9 +1922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1966,9 +1966,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; 
GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2010,9 +2010,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2054,9 +2054,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2098,9 +2098,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2142,9 +2142,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 -; GFX678-NEXT: 
v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 8b5c34d97e50e..f5bc295ba6b85 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -34,9 +34,9 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -80,9 +80,9 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -124,9 +124,9 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -168,9 +168,9 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset0_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: 
v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -212,9 +212,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -256,9 +256,9 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -306,9 +306,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s4, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -359,9 +359,9 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -634,10 +634,10 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou ; VI-NEXT: v_mov_b32_e32 v2, s9 ; 
VI-NEXT: v_bfi_b32 v3, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -704,13 +704,13 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou ; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -787,14 +787,14 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 9e527cf38e7ee..e044fbfcd488b 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1724,11 +1724,11 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1800,11 +1800,11 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1876,11 +1876,11 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index c510c40c8536c..e618a723b7fc9 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -540,9 +540,9 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -601,9 +601,9 
@@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -662,9 +662,9 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -886,9 +886,9 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1282,10 +1282,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1358,10 +1358,10 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_rcp_f32_e32 v0, s3 ; GFX8-NEXT: v_rcp_f32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 ; 
GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2157,10 +2157,10 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2307,10 +2307,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr ; GFX8-NEXT: v_fma_f32 v3, v4, v2, v3 ; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1 ; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v0, s2, 1.0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll index 607ed85274e40..4a5fb7d4a511f 100644 --- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll +++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll @@ -9,17 +9,17 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 { ; GCN-LABEL: same_address_fence_merge_write2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GCN-NEXT: s_mov_b32 s1, 0x40100000 +; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66 ; 
GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b2e9831d6c84f..d0df75a4a4a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_mov_b32 s2, s0 ; GFX9-NEXT: s_mov_b32 s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -97,8 +97,8 @@ define amdgpu_kernel void @zero_init_kernel() { ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index b35f07002a48a..9e6ac75a2ee3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -5603,8 +5603,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5618,8 +5618,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5631,8 +5631,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5666,8 +5666,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5685,8 +5685,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5702,8 +5702,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GCN3-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5750,8 +5750,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN1-NEXT: s_addc_u32 s1, s3, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5771,8 +5771,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN2-NEXT: s_addc_u32 s1, s3, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5790,8 +5790,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s2, s0 ; GCN3-NEXT: s_addc_u32 s1, s3, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5839,8 +5839,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: s_addc_u32 s1, s1, s5 ; GCN1-NEXT: s_add_u32 s0, s0, 16 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: 
flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5863,8 +5863,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: s_addc_u32 s1, s1, s5 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5885,8 +5885,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GCN3-NEXT: v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5932,8 +5932,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5945,8 +5945,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5958,8 +5958,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; GCN3-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5990,8 +5990,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6007,8 +6007,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6024,8 +6024,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6069,8 +6069,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_add_u32 s0, s2, s0 ; GCN1-NEXT: s_addc_u32 s1, s3, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; 
GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6088,8 +6088,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_add_u32 s0, s2, s0 ; GCN2-NEXT: s_addc_u32 s1, s3, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6107,8 +6107,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: s_add_u32 s0, s2, s0 ; GCN3-NEXT: s_addc_u32 s1, s3, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6153,8 +6153,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: s_add_u32 s0, s0, s4 ; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6175,8 +6175,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: s_add_u32 s0, s0, s4 ; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6197,8 +6197,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GCN3-NEXT: 
v_mov_b32_e32 v0, s8 ; GCN3-NEXT: s_add_u32 s0, s0, s4 ; GCN3-NEXT: s_addc_u32 s1, s1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v1, s9 +; GCN3-NEXT: v_mov_b32_e32 v3, s1 ; GCN3-NEXT: v_mov_b32_e32 v2, s0 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8797,7 +8797,7 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 ; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 278964334b711..e4b69e045d609 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1497,8 +1497,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1522,8 +1522,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1547,8 +1547,8 @@ define amdgpu_gfx i32 
@flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1626,8 +1626,8 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2159,8 +2159,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2184,8 +2184,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2209,8 +2209,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 
s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2288,8 +2288,8 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2839,8 +2839,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2865,8 +2865,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2891,8 +2891,8 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2973,8 +2973,8 @@ define amdgpu_gfx i32 
@flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3592,8 +3592,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3617,8 +3617,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3642,8 +3642,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3721,8 +3721,8 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: 
s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4254,8 +4254,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4279,8 +4279,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4304,8 +4304,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4383,8 +4383,8 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4916,8 +4916,8 @@ define amdgpu_gfx i32 
@flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4941,8 +4941,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4966,8 +4966,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5045,8 +5045,8 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5980,8 +5980,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 
s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6005,8 +6005,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6030,8 +6030,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6109,8 +6109,8 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6954,8 +6954,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6979,8 +6979,8 @@ define amdgpu_gfx i32 
@flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7004,8 +7004,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7083,8 +7083,8 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7616,8 +7616,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7641,8 +7641,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: 
s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7666,8 +7666,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7745,8 +7745,8 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8703,8 +8703,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8730,8 +8730,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8757,8 +8757,8 @@ define amdgpu_gfx 
i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8842,8 +8842,8 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9262,9 +9262,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v1, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9291,9 +9291,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v1, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9320,9 +9320,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: 
flat_load_dword v1, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9411,9 +9411,9 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9445,9 +9445,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[36:37], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 ; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9474,9 +9474,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[36:37], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 ; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9503,9 +9503,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 
s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9594,9 +9594,9 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[36:37], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 ; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 25fbdbc83b2b9..c17b82a036fc0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -29,8 +29,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -79,8 +79,8 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -164,8 +164,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB1_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: 
v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -216,8 +216,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB1_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -320,8 +320,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -374,8 +374,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -467,8 +467,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB3_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -521,8 
+521,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB3_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -621,8 +621,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -669,8 +669,8 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -750,8 +750,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB5_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -800,8 +800,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB5_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -900,8 +900,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -952,8 +952,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1041,8 +1041,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB7_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1093,8 +1093,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB7_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1193,8 +1193,8 @@ define amdgpu_kernel void 
@atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1242,8 +1242,8 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1325,8 +1325,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB9_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1376,8 +1376,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB9_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1478,8 +1478,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 
v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1531,8 +1531,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1622,8 +1622,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB11_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1675,8 +1675,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB11_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1773,8 +1773,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1820,8 +1820,8 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; 
GCN2-NEXT: .LBB12_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1899,8 +1899,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB13_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1948,8 +1948,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB13_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2046,8 +2046,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2097,8 +2097,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2184,8 +2184,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB15_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2235,8 +2235,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB15_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2333,8 +2333,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2383,8 +2383,8 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2468,8 +2468,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB17_2 ; GCN1-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2520,8 +2520,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB17_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2624,8 +2624,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2678,8 +2678,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2771,8 +2771,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB19_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 
v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2825,8 +2825,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB19_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2925,8 +2925,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2973,8 +2973,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3054,8 +3054,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB21_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3104,8 +3104,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB21_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: 
v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3204,8 +3204,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3256,8 +3256,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3345,8 +3345,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB23_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3397,8 +3397,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB23_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3497,8 +3497,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3547,8 +3547,8 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3632,8 +3632,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB25_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3685,8 +3685,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB25_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3790,8 +3790,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3844,8 +3844,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -3937,8 +3937,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB27_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -3992,8 +3992,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB27_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4093,8 +4093,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4141,8 +4141,8 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; 
GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4222,8 +4222,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB29_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4273,8 +4273,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB29_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4374,8 +4374,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4426,8 +4426,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smax_x2 
v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4515,8 +4515,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB31_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4568,8 +4568,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB31_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4669,8 +4669,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4719,8 +4719,8 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4804,8 +4804,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB33_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; 
GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -4857,8 +4857,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB33_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -4962,8 +4962,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5016,8 +5016,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5109,8 +5109,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB35_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5164,8 +5164,8 @@ define 
amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB35_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5265,8 +5265,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5313,8 +5313,8 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5394,8 +5394,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB37_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5445,8 +5445,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB37_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; 
GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5546,8 +5546,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5598,8 +5598,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5687,8 +5687,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB39_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5740,8 +5740,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB39_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5841,8 +5841,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: 
.LBB40_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -5891,8 +5891,8 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -5976,8 +5976,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB41_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6029,8 +6029,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB41_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6134,8 +6134,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) @@ -6188,8 +6188,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6281,8 +6281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB43_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6336,8 +6336,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB43_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6437,8 +6437,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6485,8 +6485,8 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, 
s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6566,8 +6566,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB45_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6617,8 +6617,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB45_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6718,8 +6718,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6770,8 +6770,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -6859,8 +6859,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; 
GCN1-NEXT: s_cbranch_vccz .LBB47_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -6912,8 +6912,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB47_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7013,8 +7013,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7063,8 +7063,8 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7148,8 +7148,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB49_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: 
flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7201,8 +7201,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB49_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7306,8 +7306,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7360,8 +7360,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7453,8 +7453,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB51_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7508,8 +7508,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB51_2 ; GCN2-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7609,8 +7609,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7657,8 +7657,8 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7738,8 +7738,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB53_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7789,8 +7789,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB53_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -7890,8 
+7890,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -7942,8 +7942,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -8031,8 +8031,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB55_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -8084,8 +8084,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB55_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -8185,8 +8185,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; 
GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8234,8 +8234,8 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8317,8 +8317,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: s_cbranch_vccz .LBB57_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8368,8 +8368,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_cbranch_vccz .LBB57_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8470,8 +8470,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8523,8 +8523,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, 
i64 %in, i64 %i ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8614,8 +8614,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB59_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8667,8 +8667,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB59_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8765,8 +8765,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8812,8 +8812,8 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: 
flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8891,8 +8891,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB61_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8940,8 +8940,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB61_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9038,8 +9038,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9089,8 +9089,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9176,8 +9176,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN1-NEXT: s_cbranch_vccz .LBB63_2 ; GCN1-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9227,8 +9227,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GCN2-NEXT: s_cbranch_vccz .LBB63_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9325,8 +9325,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9370,8 +9370,8 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9451,8 +9451,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -9496,8 +9496,8 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9577,8 +9577,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9622,8 +9622,8 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9698,8 +9698,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB67_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9748,8 +9748,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB67_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, 
s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9848,8 +9848,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9897,8 +9897,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9981,8 +9981,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_cbranch_vccz .LBB69_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10033,8 +10033,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_cbranch_vccz .LBB69_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-10129,8 +10129,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10172,8 +10172,8 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10244,8 +10244,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB71_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10292,8 +10292,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB71_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10388,8 +10388,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10435,8 +10435,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10515,8 +10515,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_cbranch_vccz .LBB73_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10565,8 +10565,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_cbranch_vccz .LBB73_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10661,8 +10661,8 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10710,8 +10710,8 @@ define 
amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10793,8 +10793,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB75_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10844,8 +10844,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB75_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10946,8 +10946,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10999,8 +10999,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11090,8 +11090,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB77_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11143,8 +11143,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB77_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11241,8 +11241,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11288,8 +11288,8 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11367,8 +11367,8 @@ define amdgpu_kernel void 
@atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB79_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11416,8 +11416,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB79_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11514,8 +11514,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11565,8 +11565,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11652,8 +11652,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB81_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11703,8 +11703,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB81_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12213,11 +12213,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12267,11 +12267,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12364,11 +12364,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; 
GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12418,11 +12418,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB91_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12508,11 +12508,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN1-NEXT: s_cbranch_vccz .LBB92_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12562,11 +12562,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GCN2-NEXT: s_cbranch_vccz .LBB92_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -12668,11 +12668,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s10 ; GCN1-NEXT: v_mov_b32_e32 v1, s11 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12724,11 +12724,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s10 ; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12820,11 +12820,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB94_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12878,11 +12878,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB94_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: 
v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12986,11 +12986,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13038,11 +13038,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13124,11 +13124,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN1-NEXT: s_cbranch_vccz .LBB96_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v4, s8 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 ; GCN1-NEXT: v_mov_b32_e32 v5, s9 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13176,11 +13176,11 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GCN2-NEXT: s_cbranch_vccz .LBB96_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v4, s8 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v4, s8 ; GCN2-NEXT: v_mov_b32_e32 v5, s9 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13278,11 +13278,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s10 ; GCN1-NEXT: v_mov_b32_e32 v1, s11 ; GCN1-NEXT: v_mov_b32_e32 v2, s14 ; GCN1-NEXT: v_mov_b32_e32 v3, s15 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13332,11 +13332,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s10 ; GCN2-NEXT: v_mov_b32_e32 v1, s11 ; GCN2-NEXT: v_mov_b32_e32 v2, s14 ; GCN2-NEXT: v_mov_b32_e32 v3, s15 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13424,11 +13424,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN1-NEXT: s_cbranch_vccz .LBB98_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s13 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; 
GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13480,11 +13480,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN2-NEXT: s_cbranch_vccz .LBB98_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s13 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14000,8 +14000,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14052,8 +14052,8 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14142,8 +14142,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB108_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: 
flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14196,8 +14196,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB108_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14305,8 +14305,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14361,8 +14361,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14459,8 +14459,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB110_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14515,8 +14515,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: 
s_cbranch_vccz .LBB110_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14620,8 +14620,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14670,8 +14670,8 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14756,8 +14756,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB112_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14808,8 +14808,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB112_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: 
flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14913,8 +14913,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14967,8 +14967,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15061,8 +15061,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB114_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15115,8 +15115,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB114_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15220,8 +15220,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; 
GCN1-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15275,8 +15275,8 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15371,8 +15371,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB116_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15429,8 +15429,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB116_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15545,8 +15545,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: 
flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15604,8 +15604,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15708,8 +15708,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cbranch_vccz .LBB118_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15768,8 +15768,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cbranch_vccz .LBB118_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15880,8 +15880,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15933,8 +15933,8 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB119_3: ; 
%atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16025,8 +16025,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_cbranch_vccz .LBB120_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16081,8 +16081,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_cbranch_vccz .LBB120_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16193,8 +16193,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_endpgm ; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16250,8 +16250,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_endpgm ; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_atomic_dec_x2 
v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16350,8 +16350,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cbranch_vccz .LBB122_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s12 ; GCN1-NEXT: v_mov_b32_e32 v3, s13 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16408,8 +16408,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cbranch_vccz .LBB122_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s12 ; GCN2-NEXT: v_mov_b32_e32 v3, s13 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index e5187a811a230..096f20b91cede 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX7-LABEL: atomic_add_i64_offset: ; GFX7: ; %bb.0: ; %entry @@ -10,9 +11,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -25,9 +26,9 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -38,14 +39,15 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -59,8 +61,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -78,8 +80,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: 
v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -92,19 +94,21 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_add_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -158,15 +162,16 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -221,17 +226,18 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, 
ptr %out2 ret void @@ -385,7 +391,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -445,7 +451,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -510,14 +516,15 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -590,19 +597,21 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_and_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: 
s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -680,15 +689,16 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, 
ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -769,17 +779,18 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1015,7 +1026,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1101,7 +1112,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 
v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1168,14 +1179,15 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1196,8 +1208,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1229,8 +1241,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1250,19 +1262,21 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_sub_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1342,15 +1356,16 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1373,8 +1388,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1408,8 +1423,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1433,17 +1448,18 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1458,8 +1474,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1486,8 +1502,8 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1531,14 +1547,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1564,14 +1580,14 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1685,7 +1701,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1706,8 +1722,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; 
GFX7-NEXT: v_mov_b32_e32 v7, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1739,8 +1755,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, v3 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc @@ -1773,7 +1789,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1842,14 +1858,15 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = 
atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -1926,19 +1943,21 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2020,15 +2039,16 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: 
flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2113,17 +2133,18 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2138,9 +2159,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 
v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2167,9 +2188,9 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2213,9 +2234,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2247,9 +2268,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2371,7 +2392,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile 
max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2461,7 +2482,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2530,14 +2551,15 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2614,19 +2636,21 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_max_u64 
v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2708,15 +2732,16 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2801,17 +2826,18 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr 
%out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2826,9 +2852,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2855,9 +2881,9 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 
v4, s0 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2901,9 +2927,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2935,9 +2961,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3059,7 +3085,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3149,7 +3175,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3218,14 +3244,15 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; 
GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3302,19 +3329,21 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr 
%out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3396,15 +3425,16 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3489,17 +3519,18 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3514,9 +3545,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3543,9 +3574,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3589,9 +3620,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3623,9 +3654,9 @@ define amdgpu_kernel void 
@atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3747,7 +3778,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3837,7 +3868,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3906,14 +3937,15 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = 
getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3990,19 +4022,21 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4084,15 +4118,16 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 
s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4177,17 +4212,18 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4202,9 +4238,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; 
GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4231,9 +4267,9 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4277,9 +4313,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4311,9 +4347,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4435,7 +4471,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds 
i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4525,7 +4561,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4590,14 +4626,15 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4670,19 +4707,21 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-LABEL: atomic_or_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
-; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4760,15 +4799,16 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ 
-4849,17 +4889,18 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5095,7 +5136,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5181,7 +5222,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") 
seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5194,9 +5235,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5209,9 +5250,9 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5222,14 +5263,15 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5241,9 +5283,9 @@ define amdgpu_kernel void 
@atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5256,9 +5298,9 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5269,14 +5311,15 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5288,9 +5331,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 
s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5303,9 +5346,9 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5316,14 +5359,15 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds ptr, ptr %out, i32 4 + %gep = getelementptr ptr, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5337,8 +5381,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 
v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5356,8 +5400,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5370,19 +5414,21 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_xchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ 
-5436,15 +5482,16 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5499,17 +5546,18 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr 
= getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5663,7 +5711,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5723,7 +5771,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5788,14 +5836,15 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = 
atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5868,19 +5917,21 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_xor_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5958,15 +6009,16 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: 
flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6047,17 +6099,18 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6293,7 +6346,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = 
getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6379,7 +6432,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6422,15 +6475,16 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %in, i64 4 + %gep = getelementptr i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6532,16 +6586,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %in, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %in, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6601,7 +6656,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %in, i64 %index + %ptr = getelementptr i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6638,14 +6693,15 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6732,15 +6788,16 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6792,7 +6849,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 ret void } @@ -6805,11 +6862,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6823,11 +6880,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: 
v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6840,15 +6897,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6861,11 +6919,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 0x11940 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6879,11 +6937,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s0, s0, 0x11940 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6896,15 +6954,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x11940 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 9000 + %gep = getelementptr i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6916,11 +6975,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6936,11 +6995,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6954,17 +7013,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -6981,11 +7041,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_add_u32 s0, s0, 32 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 
+; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7001,11 +7061,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7020,14 +7080,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7044,10 +7106,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX7-NEXT: s_addc_u32 s3, s9, s3 ; GFX7-NEXT: s_add_u32 s2, s0, 32 ; GFX7-NEXT: s_addc_u32 s3, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, 
s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7068,10 +7130,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX8-NEXT: s_addc_u32 s3, s9, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7091,16 +7153,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[8:9], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 
%extract0, ptr %out2 @@ -7223,11 +7287,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX7-NEXT: s_add_u32 s0, s0, s4 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7241,11 +7305,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7266,7 +7330,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7280,11 +7344,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX7-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX7-NEXT: s_add_u32 s2, s8, s2 ; GFX7-NEXT: s_addc_u32 s3, s9, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7302,11 +7366,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; GFX8-NEXT: s_add_u32 s2, s8, s2 ; GFX8-NEXT: s_addc_u32 s3, s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7334,7 +7398,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7378,15 +7442,16 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds double, ptr %in, i64 4 + %gep = getelementptr double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, 
!noalias.addrspace !0 store double %val, ptr %out ret void @@ -7488,16 +7553,17 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %in, i64 %index - %gep = getelementptr inbounds double, ptr %ptr, i64 4 + %ptr = getelementptr double, ptr %in, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7557,7 +7623,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %in, i64 %index + %ptr = getelementptr double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7594,14 +7660,15 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; 
GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7688,15 +7755,16 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %out, i64 %index - %gep = getelementptr inbounds double, ptr %ptr, i64 4 + %ptr = getelementptr double, ptr %out, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7748,7 +7816,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds double, ptr %out, i64 %index + %ptr = getelementptr double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7818,14 +7886,15 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -7845,8 +7914,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -7880,8 +7949,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -7904,19 +7973,21 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_inc_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; 
GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8000,15 +8071,16 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = 
getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8030,8 +8102,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8067,8 +8139,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8095,17 +8167,18 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8200,8 +8273,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8233,8 +8306,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8355,7 +8428,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8375,8 +8448,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8410,8 +8483,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5] @@ -8447,7 +8520,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8526,14 +8599,15 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep 
= getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8620,19 +8694,21 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_dec_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8724,15 +8800,16 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, 
s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8827,17 +8904,18 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8852,9 +8930,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8886,9 +8964,9 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9111,7 +9189,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -9211,7 +9289,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 9e27f6badfdac..e39fd817a0bf5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s ; RUN: llc 
-mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; --------------------------------------------------------------------- @@ -478,8 +479,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -517,8 +518,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -553,8 +554,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -595,8 +596,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-636,8 +637,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB5_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -674,8 +675,8 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -709,8 +710,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cbranch_vccz .LBB6_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -748,8 +749,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cbranch_vccz .LBB6_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -784,8 +785,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cbranch_vccz .LBB6_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, 
s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -826,8 +827,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_cbranch_vccz .LBB7_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -867,8 +868,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_cbranch_vccz .LBB7_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -905,8 +906,8 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: s_cbranch_vccz .LBB7_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1638,8 +1639,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 
v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1677,8 +1678,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1713,8 +1714,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1755,8 +1756,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1796,8 +1797,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1834,8 +1835,8 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB15_3: ; 
%atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1869,8 +1870,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN1-NEXT: s_cbranch_vccz .LBB16_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1908,8 +1909,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN2-NEXT: s_cbranch_vccz .LBB16_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1944,8 +1945,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN3-NEXT: s_cbranch_vccz .LBB16_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1986,8 +1987,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_cbranch_vccz .LBB17_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: 
v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2027,8 +2028,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_cbranch_vccz .LBB17_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2065,8 +2066,8 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_cbranch_vccz .LBB17_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2856,8 +2857,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2900,8 +2901,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2941,8 +2942,8 @@ define amdgpu_gfx void 
@flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2988,8 +2989,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3034,8 +3035,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3077,8 +3078,8 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3117,8 +3118,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cbranch_vccz .LBB26_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: 
v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3159,8 +3160,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cbranch_vccz .LBB26_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3198,8 +3199,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cbranch_vccz .LBB26_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3243,8 +3244,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cbranch_vccz .LBB27_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3287,8 +3288,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cbranch_vccz .LBB27_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -3328,8 +3329,8 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cbranch_vccz .LBB27_2 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4000,8 +4001,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 ; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4058,8 +4059,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4119,8 +4120,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 ; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4184,8 +4185,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: 
.LBB33_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 ; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4246,8 +4247,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4303,8 +4304,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 ; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -4736,8 +4737,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4797,8 +4798,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN2-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4850,8 +4851,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4914,8 +4915,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_mov_b32_e32 v8, v1 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -4977,8 +4978,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_mov_b32_e32 v8, v1 ; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -5032,8 +5033,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start ; 
GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_mov_b32_e32 v8, v1 ; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -22039,8 +22040,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v7, v5 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -22102,8 +22103,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v7, v5 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -22168,8 +22169,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22238,8 +22239,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_mov_b32_e32 v9, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22305,8 +22306,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_mov_b32_e32 v9, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22367,8 +22368,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_mov_b32_e32 v9, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -22828,8 +22829,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -22893,8 +22894,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: 
v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -22950,8 +22951,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23018,8 +23019,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23085,8 +23086,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -23144,8 +23145,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_mov_b32_e32 v7, v1 ; GCN3-NEXT: v_add_co_u32_e32 v0, 
vcc, 1, v6 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index f655d4761fa31..9b26c3f50a4b4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; --------------------------------------------------------------------- @@ -59,11 +60,13 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -120,11 +123,13 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -175,9 +180,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -189,9 +194,9 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -201,15 +206,17 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, 
ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -260,9 +267,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -274,9 +281,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -286,15 +293,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in 
seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -323,11 +332,13 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -356,11 +367,13 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -421,11 +434,13 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -482,11 +497,13 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -537,9 +554,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -551,9 +568,9 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -563,15 +580,17 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -622,9 +641,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -636,9 +655,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -648,15 +667,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i32 4 + %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -685,11 +706,13 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -718,11 +741,13 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds double, ptr %out, i64 4 + %gep = getelementptr double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret double %result } @@ -783,11 +808,13 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_add_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -844,11 +871,13 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -899,9 
+928,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -913,9 +942,9 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -925,15 +954,17 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -984,9 +1015,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -998,9 +1029,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1010,15 +1041,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1047,11 +1080,13 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; 
GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1080,11 +1115,13 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1232,14 +1269,18 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -1251,7 +1292,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1268,8 +1309,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 ; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1296,8 +1337,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1321,8 +1362,8 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 
v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1355,8 +1396,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 ; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1383,8 +1424,8 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1401,28 +1442,32 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: 
v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, v9, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v10, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB33_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1431,17 +1476,17 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1464,17 +1509,17 @@ define amdgpu_gfx void 
@flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1500,9 +1545,9 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1595,19 +1640,23 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: .LBB35_1: ; 
%atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s36 +; GFX9-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -1619,7 +1668,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1628,23 +1677,23 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; 
GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1661,23 +1710,23 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1697,15 +1746,15 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 ; GFX9-NEXT: 
v_subb_co_u32_e32 v6, vcc, v8, v4, vcc ; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1741,8 +1790,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v8, v1 ; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 ; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1774,8 +1823,8 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1792,31 +1841,35 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v1 -; 
GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB37_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1845,11 +1898,13 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1878,11 +1933,13 @@ 
define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2030,14 +2087,18 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -2049,7 +2110,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - 
%gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2199,28 +2260,32 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB43_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 ret i64 %result } @@ -2229,16 +2294,16 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2261,16 +2326,16 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2296,8 +2361,8 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2388,18 +2453,22 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -2411,7 +2480,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2420,16 +2489,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 
4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2452,16 +2521,16 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2487,8 +2556,8 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2579,30 +2648,34 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB47_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2631,11 +2704,13 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -2664,11 +2739,13 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2826,8 +2903,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2835,7 +2916,7 @@ define void 
@flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 ; GFX9-NEXT: v_not_b32_e32 v5, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -2847,7 +2928,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3007,30 +3088,34 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v0, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v2 +; 
GFX9-NEXT: v_not_b32_e32 v8, v0 +; GFX9-NEXT: v_not_b32_e32 v7, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3039,16 +3124,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3073,16 +3158,16 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: 
v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3110,8 +3195,8 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3208,20 +3293,24 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v2 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: v_not_b32_e32 v1, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_not_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 
s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -3233,7 +3322,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3242,16 +3331,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3276,16 +3365,16 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: 
s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3313,8 +3402,8 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3411,32 +3500,36 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v0 -; GFX9-NEXT: v_not_b32_e32 v4, v1 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v2 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v4 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3505,8 +3598,12 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3514,7 +3611,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 ; GFX9-NEXT: v_not_b32_e32 v5, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -3526,7 +3623,7 @@ define void 
@flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -3595,30 +3692,34 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 -; GFX9-NEXT: v_not_b32_e32 v5, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_and_b32_e32 v0, v10, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v2 +; GFX9-NEXT: v_not_b32_e32 v8, v0 +; GFX9-NEXT: v_not_b32_e32 v7, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB59_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -3766,14 +3867,18 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -3785,7 +3890,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3935,28 +4040,32 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_or_i64_ret_offset: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_or_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB63_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3965,16 +4074,16 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3997,16 +4106,16 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4032,8 +4141,8 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4124,18 +4233,22 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4147,7 +4260,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4156,16 +4269,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4188,16 +4301,16 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4223,8 +4336,8 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4315,30 +4428,34 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB67_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4367,11 +4484,13 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -4400,11 +4519,13 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -4552,14 +4673,18 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 ; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4571,7 +4696,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr 
%out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4721,28 +4846,32 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, v10, v3 +; GFX9-NEXT: v_xor_b32_e32 v7, v9, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB73_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr 
inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4751,16 +4880,16 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4783,16 +4912,16 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4818,8 +4947,8 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 
s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4910,18 +5039,22 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -4933,7 +5066,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4942,16 +5075,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4974,16 +5107,16 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5009,8 +5142,8 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5101,30 +5234,34 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 -; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB77_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5153,11 +5290,13 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -5186,11 +5325,13 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -5343,15 +5484,19 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_max_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -5363,7 +5508,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5518,29 +5663,33 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_max_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 
v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5549,18 +5698,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_max_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5584,18 +5733,18 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_max_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; 
GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5622,10 +5771,10 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5723,21 +5872,25 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -5749,7 +5902,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5758,18 +5911,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_max_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5793,18 +5946,18 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; 
GFX8-LABEL: flat_atomic_max_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5831,10 +5984,10 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5932,33 +6085,37 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 
v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6038,21 +6195,27 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, 
s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -6064,8 +6227,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6150,38 +6313,44 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 
s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr 
%ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6284,7 +6453,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6395,7 +6564,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6425,11 +6594,13 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -6458,11 +6629,13 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -6615,15 +6788,19 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -6635,7 +6812,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace 
!1 ret void } @@ -6790,29 +6967,33 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret 
i64 %result } @@ -6821,18 +7002,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6856,18 +7037,18 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6894,10 +7075,10 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; 
GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6995,21 +7176,25 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -7021,7 +7206,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; 
GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7030,18 +7215,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7065,18 +7250,18 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7103,10 +7288,10 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 
inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7204,33 +7389,37 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc 
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7310,21 +7499,27 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -7336,8 +7531,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7422,38 +7617,44 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc 
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7565,7 +7766,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7595,11 +7796,13 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: 
flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -7628,11 +7831,13 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -7785,15 +7990,19 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, 
v1, vcc ; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -7805,7 +8014,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7960,29 +8169,33 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 
v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7991,18 +8204,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8026,18 +8239,18 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8064,10 +8277,10 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8165,21 +8378,25 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -8191,7 +8408,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8200,18 +8417,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8235,18 
+8452,18 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8273,10 +8490,10 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8374,33 +8591,37 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 
s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8429,11 +8650,13 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, 
vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -8462,11 +8685,13 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -8619,15 +8844,19 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_min_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, 
v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -8639,7 +8868,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8794,29 +9023,33 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_min_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v9, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8825,18 +9058,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_min_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8860,18 +9093,18 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX8-LABEL: flat_atomic_min_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, 
v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8898,10 +9131,10 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8999,21 +9232,25 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] 
offset:32 glc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s36 +; GFX9-NEXT: v_mov_b32_e32 v7, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9025,7 +9262,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9034,18 +9271,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_min_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9069,18 +9306,18 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX8-LABEL: flat_atomic_min_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 
v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9107,10 +9344,10 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9208,33 +9445,37 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, 
v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9314,21 +9555,27 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_add_u32 s4, s0, s4 +; GFX9-NEXT: s_addc_u32 s5, s1, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 32 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: 
s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s6, s4, 32 +; GFX9-NEXT: s_addc_u32 s7, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -9340,8 +9587,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9426,38 +9673,44 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3 -; GFX9-NEXT: s_add_u32 s0, s8, s0 -; GFX9-NEXT: s_addc_u32 s1, s9, s1 +; GFX9-NEXT: s_add_u32 s2, s8, s0 +; GFX9-NEXT: s_addc_u32 s3, s9, s1 +; GFX9-NEXT: s_add_u32 s0, s2, 32 +; GFX9-NEXT: s_addc_u32 s1, s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: flat_load_dwordx2 
v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: s_add_u32 s4, s2, 32 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[6:7] +; GFX9-NEXT: s_addc_u32 s5, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index - %gep = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 
ret void @@ -9472,9 +9725,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9502,9 +9755,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9532,9 +9785,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9664,7 +9917,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -9694,11 +9947,13 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: 
; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -9727,11 +9982,13 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -9894,8 +10151,12 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9904,7 +10165,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -9916,7 +10177,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9933,8 +10194,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -9964,8 +10225,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -9992,8 +10253,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) 
{ ; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10029,8 +10290,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10060,8 +10321,8 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10081,31 +10342,35 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start 
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v10, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB134_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10114,16 +10379,16 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: 
v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10149,16 +10414,16 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10187,8 +10452,8 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10288,21 +10553,25 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 
32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_add_u32 s36, s4, 32 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -10314,7 +10583,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10323,22 +10592,22 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 
; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10358,22 +10627,22 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10396,14 +10665,14 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10441,8 +10710,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10476,8 +10745,8 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -10497,33 +10766,37 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: s_add_u32 s36, s4, 32 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: s_addc_u32 s37, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, s37 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_cbranch_execnz .LBB138_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10552,11 +10825,13 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, 
vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -10585,11 +10860,13 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -10762,8 +11039,12 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10774,7 +11055,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: 
s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -10786,7 +11067,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10961,33 +11242,37 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 32, v5 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[9:10] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[9:10], v[2:3] +; 
GFX9-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v10, s[6:7] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] ; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB144_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10996,18 +11281,18 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-NEXT: v_mov_b32_e32 v4, s35 ; GFX7-NEXT: flat_load_dword v2, v[0:1] ; GFX7-NEXT: flat_load_dword v3, v[3:4] -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 @@ -11035,18 +11320,18 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v4, s35 ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11077,10 +11362,10 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11190,25 +11475,29 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: 
s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s34 +; GFX9-NEXT: v_mov_b32_e32 v7, s35 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -11220,7 +11509,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -11229,18 +11518,18 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: flat_load_dword v1, v[2:3] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b64 s[38:39], 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11268,18 +11557,18 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b64 s[38:39], 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11310,10 +11599,10 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11423,37 +11712,41 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-LABEL: 
flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] -; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v6 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_add_u32 s34, s4, 32 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v7, s[36:37] +; GFX9-NEXT: s_addc_u32 s35, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_cbranch_execnz .LBB148_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -11482,11 +11775,13 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -11515,11 +11810,13 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr inbounds i64, ptr %out, i64 4 + %gep = getelementptr i64, ptr %out, 
i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 0283b5ff5d439..815843cf85786 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -806,9 +806,9 @@ define amdgpu_ps <8 x float> @test_fmaximum_v4f64_ss(<4 x double> inreg %a, <4 x ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000 ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-SDAG-NEXT: v_max_f64 v[4:5], s[2:3], v[1:2] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 29163c111fc5e..4dc7e436b52b5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -806,8 +806,8 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index 4cccc768d3c50..606c58d1bd470 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -806,9 +806,9 @@ define amdgpu_ps <8 x float> @test_fminimum_v4f64_ss(<4 x double> inreg %a, <4 x ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; 
GFX9-SDAG-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v10, 0x7ff80000 ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-SDAG-NEXT: v_min_f64 v[4:5], s[2:3], v[1:2] ; GFX9-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[2:3], v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 02ce8be125afc..1694af9168e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s + ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't ; make add an instruction if the fadd has more than one use. 
@@ -88,17 +89,17 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_add_u32 s2, s0, 4 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_add_f32_e64 v2, s4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mac_f32_e64 v3, s4, 2.0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -151,16 +152,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: v_add_f32_e64 v2, |s2|, |s2| -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -212,11 +213,11 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_addc_u32 s7, s5, 0 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: 
s_addc_u32 s7, s5, 0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 @@ -324,9 +325,9 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -556,13 +557,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 +; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, s4, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, s4, s4 -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 @@ -580,16 +581,16 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 -; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, s4, s4 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: 
flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 -; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s4, 2.0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) @@ -712,13 +713,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 +; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, |s4|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-DENORM-NEXT: s_add_u32 s2, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 @@ -736,13 +737,13 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 +; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 +; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| +; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s4|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s4|, |s4| -; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 @@ -869,13 +870,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s5, s2, 16 ; VI-DENORM-NEXT: 
v_mov_b32_e32 v0, s3 +; VI-DENORM-NEXT: s_add_u32 s4, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5 ; VI-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, v0 +; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0 ; VI-DENORM-NEXT: v_fma_f16 v3, |s2|, 2.0, v1 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 -; VI-DENORM-NEXT: s_add_u32 s4, s0, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0 ; VI-DENORM-NEXT: flat_store_short v[0:1], v3 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4 @@ -893,13 +894,13 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s5, s2, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 +; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 ; VI-FLUSH-NEXT: v_mad_f16 v2, |s2|, 2.0, v0 +; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s2|, 2.0, v1 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 -; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4 @@ -1086,9 +1087,9 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 6c2ab5fb15a20..4a84b011fa82f 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -134,9 +134,9 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x 
float> % ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_rndne_f32_e32 v1, s3 ; VI-NEXT: v_rndne_f32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -177,10 +177,10 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> % ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_rndne_f32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_rndne_f32_e32 v2, s2 ; VI-NEXT: v_rndne_f32_e32 v1, s1 ; VI-NEXT: v_rndne_f32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -417,8 +417,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s3 -; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index b3202cbe30d0b..f834ef89af7e9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7498,21 +7498,21 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v5, s15 ; SI-NEXT: v_add_i32_e32 v4, vcc, s14, v6 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v7, s1 ; SI-NEXT: flat_load_dword v8, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: flat_load_dword v2, v[2:3] glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: flat_load_dword v3, v[4:5] glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, s1 ; 
SI-NEXT: v_add_i32_e32 v0, vcc, s0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; SI-NEXT: flat_load_dword v0, v[0:1] glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_trunc_f32_e32 v0, v8 ; SI-NEXT: v_fma_f32 v2, -v0, v2, v3 ; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm @@ -7532,21 +7532,21 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %o ; VI-NEXT: v_mov_b32_e32 v5, s15 ; VI-NEXT: v_add_u32_e32 v4, vcc, s14, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: flat_load_dword v8, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v3, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_trunc_f32_e32 v0, v8 ; VI-NEXT: v_fma_f32 v2, -v0, v2, v3 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 833be2066cd54..9c9ac5f6c6f40 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4088,9 +4088,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] ; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v3, s3 ; 
SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -4108,9 +4108,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec ; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5] ; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -4147,11 +4147,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_bitcmp1_b32 s2, 16 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] ; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 5424ebfcffcd1..4c994814dbc6f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -410,9 +410,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -442,9 +442,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, 
v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -626,9 +626,9 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -652,9 +652,9 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_or_b32 s3, s3, s5 ; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9d9a851a5507e..6a293b80f8711 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -409,9 +409,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_add_f16_e64 v1, s2, 1.0 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -518,9 +518,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CIVI-NEXT: 
v_mov_b32_e32 v3, s1 ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: v_mov_b32_e32 v3, s1 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CIVI-NEXT: s_endpgm @@ -590,9 +590,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mul_f16_e64 v1, |s2|, -4.0 ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 52b6d2cbaa6eb..67cc78cd921d9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -191,9 +191,9 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -227,11 +227,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s1, 31 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -278,9 +278,9 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr 
addrspace(1) %out, <4 x double> % ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 214ccedd75170..a55f29888d223 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -216,9 +216,9 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s3, 31 ; VI-NEXT: s_bitset1_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -256,11 +256,11 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> % ; VI-NEXT: s_bitset1_b32 s2, 31 ; VI-NEXT: s_bitset1_b32 s1, 31 ; VI-NEXT: s_bitset1_b32 s0, 31 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index db08cb132a3d7..da37047048fb6 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1664,10 +1664,10 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 
-; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index 63aadaacbeb3a..802d72129d537 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 { ; CI-LABEL: s_fneg_bf16: ; CI: ; %bb.0: @@ -416,8 +417,8 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-LABEL: s_fneg_v2bf16_nonload: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND @@ -426,10 +427,10 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_or_b32 s2, s2, s3 ; CI-NEXT: s_add_i32 s2, s2, 0x80000000 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -444,9 +445,9 @@ define 
amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index cab27fca5ab0a..490623f5795fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -376,9 +376,9 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 02235151a83e1..31e1be387b035 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -69,9 +69,9 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -120,11 +120,11 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl ; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 ; VI-NEXT: s_xor_b32 s1, s1, 0x80000000 ; VI-NEXT: s_xor_b32 s0, 
s0, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 66d859fbd66ee..9d3e0fa635d41 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -22,9 +22,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS @@ -64,9 +64,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 4b800e4d47172..c52bb8d785057 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -28,8 +28,8 @@ define amdgpu_kernel void @test_isinf_pattern(ptr 
addrspace(1) nocapture %out, f ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -74,8 +74,8 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture % ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -155,8 +155,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -203,8 +203,8 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -248,8 +248,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: 
s_endpgm @@ -301,8 +301,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -355,8 +355,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 ; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 ; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -408,8 +408,8 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s6, s6 ; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 ; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -458,8 +458,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 
1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -562,8 +562,8 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 ; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -614,8 +614,8 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; VI-NEXT: v_mov_b32_e32 v0, 0x204 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -664,8 +664,8 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -713,8 +713,8 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 15619532414ea..316187e064043 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -119,8 +119,8 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -484,8 +484,8 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -843,8 +843,8 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1197,8 +1197,8 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1586,8 +1586,8 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; 
SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1934,8 +1934,8 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 ; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v0, v1 +; SDAG-NEXT: v_mov_b32_e32 v3, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: ; %bb.9: ; %Flow3 ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll index 49204f84acb85..3d1d2a231805f 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -264,8 +264,8 @@ define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, doubl ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] ; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 11954ab7e5a2c..ea64e20029e89 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -11,6 +11,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s + define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: @@ -249,8 +250,8 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 8 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -2048,7 +2049,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 ; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX1200-FAKE16-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 @@ -2263,8 +2264,8 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -3417,7 +3418,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 @@ -3919,8 +3920,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB6_5 ; GFX10-NEXT: ; %bb.6: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v6, v10 ; GFX10-NEXT: v_mov_b32_e32 v13, s2 +; GFX10-NEXT: v_mov_b32_e32 v6, v10 ; GFX10-NEXT: v_mov_b32_e32 v7, v11 ; GFX10-NEXT: .LBB6_7: ; 
%frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 25, v13 @@ -4821,7 +4822,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1] ; GFX1200-NEXT: s_endpgm - ptr addrspace(1) %in2) #0 { + ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 %r2 = frem afn double %r0, %r1 @@ -5262,8 +5263,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s5, 0 ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s2 @@ -16843,8 +16844,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB13_5 ; GFX10-NEXT: ; %bb.6: ; %Flow51 -; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v17, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v11, v15 ; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 @@ -16915,8 +16916,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], 26 ; GFX10-NEXT: s_cbranch_scc1 .LBB13_13 ; GFX10-NEXT: ; %bb.14: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v19, s2 +; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v13, v17 ; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 @@ -17562,1360 +17563,5 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } - -define amdgpu_kernel void 
@frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; SI-LABEL: frem_v2f64_const_zero_num: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const_zero_num: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s2 -; CI-NEXT: s_mov_b32 s5, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: s_and_b64 s[2:3], vcc, exec -; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 -; CI-NEXT: s_mov_b32 s2, s6 -; CI-NEXT: s_mov_b32 s3, s7 -; CI-NEXT: v_mov_b32_e32 v1, s8 -; CI-NEXT: v_mov_b32_e32 v2, v0 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s4 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const_zero_num: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: 
v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] -; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const_zero_num: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4] -; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const_zero_num: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const_zero_num: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const_zero_num: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3] -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX1150-NEXT: v_mov_b32_e32 v3, s3 -; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const_zero_num: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: v_mov_b32_e32 v0, 0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: 
global_load_b128 v[1:4], v0, s[2:3] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] -; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo -; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] -; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 -; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 -; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo -; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: v_mov_b32_e32 v3, s3 -; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 - %r1 = frem <2 x double> , %r0 - store <2 x double> %r1, ptr addrspace(1) %out, align 16 - ret void -} - -define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; SI-LABEL: frem_v2f64_const_one_denum: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccz .LBB15_2 -; SI-NEXT: ; %bb.1: ; %frem.else16 -; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB15_3 -; SI-NEXT: s_branch .LBB15_8 -; SI-NEXT: .LBB15_2: -; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB15_3: ; %frem.compute15 -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0x7ff00000 -; SI-NEXT: 
v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] -; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s2, v6 -; SI-NEXT: s_cselect_b32 s3, s2, 0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_add_i32 s5, s3, -1 -; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26 -; SI-NEXT: s_cmp_lt_i32 s5, 27 -; SI-NEXT: s_cbranch_scc1 .LBB15_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; SI-NEXT: s_add_i32 s5, s3, 25 -; SI-NEXT: v_mov_b32_e32 v9, 0x43300000 -; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: .LBB15_5: ; %frem.loop_body23 -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_bfi_b32 v5, s4, v9, v8 -; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5] -; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3] -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6] -; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26 -; SI-NEXT: s_sub_i32 s5, s5, 26 -; SI-NEXT: s_cmp_gt_i32 s5, 26 -; SI-NEXT: s_cbranch_scc1 .LBB15_5 -; SI-NEXT: ; %bb.6: ; %Flow50 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; SI-NEXT: s_sub_i32 s2, s5, 25 -; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 -; SI-NEXT: v_bfi_b32 v7, s2, v6, v5 -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: 
v_add_f64 v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc -; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; SI-NEXT: .LBB15_8: -; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccz .LBB15_10 -; SI-NEXT: ; %bb.9: ; %frem.else -; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; SI-NEXT: s_mov_b64 vcc, exec -; SI-NEXT: s_cbranch_execz .LBB15_11 -; SI-NEXT: s_branch .LBB15_16 -; SI-NEXT: .LBB15_10: -; SI-NEXT: ; implicit-def: $vgpr6_vgpr7 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB15_11: ; %frem.compute -; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, 0x7ff00000 -; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3] -; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; SI-NEXT: s_and_b64 s[2:3], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s2, v8 -; SI-NEXT: s_cselect_b32 s3, s2, 0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_add_i32 s5, s3, -1 -; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26 -; SI-NEXT: s_cmp_lt_i32 s5, 27 -; SI-NEXT: s_cbranch_scc1 .LBB15_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; SI-NEXT: s_add_i32 s5, s3, 25 -; SI-NEXT: v_mov_b32_e32 v11, 0x43300000 -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: .LBB15_13: ; %frem.loop_body -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: 
v_mov_b32_e32 v10, v8 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_bfi_b32 v7, s4, v11, v10 -; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7] -; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3] -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8] -; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26 -; SI-NEXT: s_sub_i32 s5, s5, 26 -; SI-NEXT: s_cmp_gt_i32 s5, 26 -; SI-NEXT: s_cbranch_scc1 .LBB15_13 -; SI-NEXT: ; %bb.14: ; %Flow -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v8, v10 -; SI-NEXT: .LBB15_15: ; %frem.loop_exit -; SI-NEXT: s_sub_i32 s2, s5, 25 -; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s3, 0x432fffff -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v8, 0x43300000 -; SI-NEXT: v_bfi_b32 v9, s2, v8, v7 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9] -; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; SI-NEXT: .LBB15_16: ; %Flow49 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s5, 0x7ff00000 -; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: 
v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] -; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const_one_denum: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s4, s2 -; CI-NEXT: s_mov_b32 s5, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; CI-NEXT: s_and_b64 vcc, exec, s[2:3] -; CI-NEXT: s_cbranch_vccz .LBB15_2 -; CI-NEXT: ; %bb.1: ; %frem.else16 -; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; CI-NEXT: s_cbranch_execz .LBB15_3 -; CI-NEXT: s_branch .LBB15_8 -; CI-NEXT: .LBB15_2: -; CI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CI-NEXT: .LBB15_3: ; %frem.compute15 -; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; CI-NEXT: s_cbranch_vccnz .LBB15_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6 -; CI-NEXT: .LBB15_5: ; %frem.loop_body23 -; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v7, v5 -; CI-NEXT: v_mov_b32_e32 v6, v4 -; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8 -; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; CI-NEXT: 
s_cbranch_vccnz .LBB15_5 -; CI-NEXT: ; %bb.6: ; %Flow50 -; CI-NEXT: v_mov_b32_e32 v4, v6 -; CI-NEXT: v_mov_b32_e32 v5, v7 -; CI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8 -; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; CI-NEXT: .LBB15_8: -; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; CI-NEXT: s_and_b64 vcc, exec, s[2:3] -; CI-NEXT: s_cbranch_vccz .LBB15_10 -; CI-NEXT: ; %bb.9: ; %frem.else -; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; CI-NEXT: s_cbranch_execz .LBB15_11 -; CI-NEXT: s_branch .LBB15_16 -; CI-NEXT: .LBB15_10: -; CI-NEXT: ; implicit-def: $vgpr6_vgpr7 -; CI-NEXT: .LBB15_11: ; %frem.compute -; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8 -; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; CI-NEXT: s_cbranch_vccnz .LBB15_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8 -; CI-NEXT: .LBB15_13: ; %frem.loop_body -; CI-NEXT: ; =>This Inner Loop Header: Depth=1 -; CI-NEXT: v_mov_b32_e32 v9, v7 -; CI-NEXT: v_mov_b32_e32 v8, v6 -; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10 
-; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 -; CI-NEXT: s_cbranch_vccnz .LBB15_13 -; CI-NEXT: ; %bb.14: ; %Flow -; CI-NEXT: v_mov_b32_e32 v6, v8 -; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB15_15: ; %frem.loop_exit -; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10 -; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; CI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; CI-NEXT: .LBB15_16: ; %Flow49 -; CI-NEXT: s_mov_b32 s4, 0 -; CI-NEXT: s_mov_b32 s5, 0x7ff00000 -; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] -; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] -; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const_one_denum: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] -; VI-NEXT: s_cbranch_vccz .LBB15_2 -; VI-NEXT: ; %bb.1: ; %frem.else16 -; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; VI-NEXT: s_cbranch_execz .LBB15_3 -; VI-NEXT: s_branch .LBB15_8 -; VI-NEXT: .LBB15_2: -; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; VI-NEXT: .LBB15_3: ; 
%frem.compute15 -; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; VI-NEXT: s_cbranch_vccnz .LBB15_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6 -; VI-NEXT: .LBB15_5: ; %frem.loop_body23 -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; VI-NEXT: s_cbranch_vccnz .LBB15_5 -; VI-NEXT: ; %bb.6: ; %Flow50 -; VI-NEXT: v_mov_b32_e32 v4, v6 -; VI-NEXT: v_mov_b32_e32 v5, v7 -; VI-NEXT: .LBB15_7: ; %frem.loop_exit24 -; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8 -; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; VI-NEXT: s_brev_b32 s2, -2 -; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 -; VI-NEXT: .LBB15_8: -; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] -; VI-NEXT: s_cbranch_vccz .LBB15_10 -; VI-NEXT: ; %bb.9: ; %frem.else -; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; VI-NEXT: s_cbranch_execz .LBB15_11 -; VI-NEXT: s_branch .LBB15_16 -; VI-NEXT: .LBB15_10: -; VI-NEXT: ; 
implicit-def: $vgpr6_vgpr7 -; VI-NEXT: .LBB15_11: ; %frem.compute -; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; VI-NEXT: s_cbranch_vccnz .LBB15_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8 -; VI-NEXT: .LBB15_13: ; %frem.loop_body -; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v9, v7 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10 -; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 -; VI-NEXT: s_cbranch_vccnz .LBB15_13 -; VI-NEXT: ; %bb.14: ; %Flow -; VI-NEXT: v_mov_b32_e32 v6, v8 -; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB15_15: ; %frem.loop_exit -; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10 -; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; VI-NEXT: s_brev_b32 s2, -2 -; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; VI-NEXT: v_bfi_b32 v7, s2, v7, v3 -; VI-NEXT: .LBB15_16: ; %Flow49 -; VI-NEXT: s_mov_b32 s2, 0 -; VI-NEXT: s_mov_b32 s3, 0x7ff00000 -; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] -; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: 
v_cndmask_b32_e32 v3, v8, v7, vcc -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const_one_denum: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 -; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX9-NEXT: s_cbranch_vccz .LBB15_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else16 -; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 -; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_3 -; GFX9-NEXT: s_branch .LBB15_8 -; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: .LBB15_3: ; %frem.compute15 -; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v6 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX9-NEXT: v_add_u32_e32 v8, 25, v6 -; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8 -; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 -; GFX9-NEXT: ; %bb.6: ; %Flow50 -; GFX9-NEXT: v_mov_b32_e32 v4, v6 
-; GFX9-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8 -; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 -; GFX9-NEXT: .LBB15_8: -; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 -; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX9-NEXT: s_cbranch_vccz .LBB15_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else -; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 -; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_11 -; GFX9-NEXT: s_branch .LBB15_16 -; GFX9-NEXT: .LBB15_10: -; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX9-NEXT: .LBB15_11: ; %frem.compute -; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX9-NEXT: v_add_u32_e32 v10, -1, v8 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX9-NEXT: v_add_u32_e32 v10, 25, v8 -; GFX9-NEXT: .LBB15_13: ; %frem.loop_body -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10 -; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX9-NEXT: 
v_cmp_lt_i32_e32 vcc, 26, v10 -; GFX9-NEXT: s_cbranch_vccnz .LBB15_13 -; GFX9-NEXT: ; %bb.14: ; %Flow -; GFX9-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10 -; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX9-NEXT: s_brev_b32 s2, -2 -; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3 -; GFX9-NEXT: .LBB15_16: ; %Flow49 -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 -; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const_one_denum: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_vccz .LBB15_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else16 -; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB15_3 -; GFX10-NEXT: s_branch .LBB15_8 -; GFX10-NEXT: .LBB15_2: -; GFX10-NEXT: ; implicit-def: 
$vgpr4_vgpr5 -; GFX10-NEXT: .LBB15_3: ; %frem.compute15 -; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX10-NEXT: v_readfirstlane_b32 s2, v6 -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX10-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX10-NEXT: s_add_i32 s2, s2, 25 -; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_sub_i32 s2, s2, 26 -; GFX10-NEXT: s_cmp_gt_i32 s2, 26 -; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX10-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX10-NEXT: ; %bb.6: ; %Flow50 -; GFX10-NEXT: v_mov_b32_e32 v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v8, s2 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 -; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo -; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX10-NEXT: .LBB15_8: -; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX10-NEXT: s_cbranch_vccz .LBB15_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else -; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX10-NEXT: v_and_b32_e32 v6, 
0x80000000, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX10-NEXT: s_cbranch_execz .LBB15_11 -; GFX10-NEXT: s_branch .LBB15_16 -; GFX10-NEXT: .LBB15_10: -; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX10-NEXT: .LBB15_11: ; %frem.compute -; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX10-NEXT: v_readfirstlane_b32 s2, v8 -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX10-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX10-NEXT: s_add_i32 s2, s2, 25 -; GFX10-NEXT: .LBB15_13: ; %frem.loop_body -; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: s_sub_i32 s2, s2, 26 -; GFX10-NEXT: s_cmp_gt_i32 s2, 26 -; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX10-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX10-NEXT: ; %bb.14: ; %Flow -; GFX10-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, v9 -; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX10-NEXT: .LBB15_16: ; %Flow49 -; 
GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const_one_denum: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else16 -; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB15_3 -; GFX11-NEXT: s_branch .LBB15_8 -; GFX11-NEXT: .LBB15_2: -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: .LBB15_3: ; %frem.compute15 -; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX11-NEXT: v_readfirstlane_b32 s2, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX11-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX11-NEXT: s_add_i32 s2, s2, 25 -; GFX11-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_sub_i32 s2, s2, 26 -; GFX11-NEXT: s_cmp_gt_i32 s2, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX11-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX11-NEXT: ; %bb.6: ; %Flow50 -; GFX11-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX11-NEXT: .LBB15_8: -; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_vccz .LBB15_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else -; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX11-NEXT: v_and_b32_e32 v6, 
0x80000000, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB15_11 -; GFX11-NEXT: s_branch .LBB15_16 -; GFX11-NEXT: .LBB15_10: -; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX11-NEXT: .LBB15_11: ; %frem.compute -; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX11-NEXT: v_readfirstlane_b32 s2, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX11-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX11-NEXT: s_add_i32 s2, s2, 25 -; GFX11-NEXT: .LBB15_13: ; %frem.loop_body -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX11-NEXT: s_sub_i32 s2, s2, 26 -; GFX11-NEXT: s_cmp_gt_i32 s2, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX11-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX11-NEXT: ; %bb.14: ; %Flow -; GFX11-NEXT: v_mov_b32_e32 v6, v8 -; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX11-NEXT: .LBB15_15: ; 
%frem.loop_exit -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX11-NEXT: .LBB15_16: ; %Flow49 -; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const_one_denum: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1150-NEXT: s_cbranch_vccz .LBB15_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else16 -; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX1150-NEXT: 
s_cbranch_execz .LBB15_3 -; GFX1150-NEXT: s_branch .LBB15_8 -; GFX1150-NEXT: .LBB15_2: -; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1150-NEXT: .LBB15_3: ; %frem.compute15 -; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX1150-NEXT: s_add_i32 s2, s2, 25 -; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX1150-NEXT: s_sub_i32 s2, s2, 26 -; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX1150-NEXT: ; %bb.6: ; %Flow50 -; GFX1150-NEXT: v_mov_b32_e32 v4, v6 -; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; 
GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX1150-NEXT: .LBB15_8: -; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1150-NEXT: s_cbranch_vccz .LBB15_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else -; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX1150-NEXT: s_cbranch_execz .LBB15_11 -; GFX1150-NEXT: s_branch .LBB15_16 -; GFX1150-NEXT: .LBB15_10: -; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1150-NEXT: .LBB15_11: ; %frem.compute -; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX1150-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX1150-NEXT: s_add_i32 s2, s2, 25 -; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body -; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX1150-NEXT: s_sub_i32 s2, s2, 26 -; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX1150-NEXT: ; %bb.14: ; %Flow -; GFX1150-NEXT: v_mov_b32_e32 v6, v8 -; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 -; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX1150-NEXT: .LBB15_16: ; %Flow49 -; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 
0x7ff00000, |v[2:3]| -; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const_one_denum: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1200-NEXT: v_mov_b32_e32 v0, 0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX1200-NEXT: s_wait_loadcnt 0x0 -; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 -; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1200-NEXT: s_cbranch_vccz .LBB15_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else16 -; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 -; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB15_3 -; GFX1200-NEXT: s_branch .LBB15_8 -; GFX1200-NEXT: .LBB15_2: -; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1200-NEXT: .LBB15_3: ; %frem.compute15 -; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| -; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6 -; GFX1200-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 -; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader -; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 -; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23 -; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: 
v_dual_mov_b32 v6, v4 -; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 -; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] -; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 -; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5 -; GFX1200-NEXT: ; %bb.6: ; %Flow50 -; GFX1200-NEXT: v_mov_b32_e32 v4, v6 -; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 -; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 -; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] -; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] -; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 -; GFX1200-NEXT: .LBB15_8: -; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 -; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_cbranch_vccz .LBB15_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else -; GFX1200-NEXT: 
v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 -; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo -; GFX1200-NEXT: s_cbranch_execz .LBB15_11 -; GFX1200-NEXT: s_branch .LBB15_16 -; GFX1200-NEXT: .LBB15_10: -; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX1200-NEXT: .LBB15_11: ; %frem.compute -; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| -; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8 -; GFX1200-NEXT: v_readfirstlane_b32 s2, v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 -; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader -; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 -; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body -; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 -; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 -; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7] -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 -; 
GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 -; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13 -; GFX1200-NEXT: ; %bb.14: ; %Flow -; GFX1200-NEXT: v_mov_b32_e32 v6, v8 -; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 -; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 -; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] -; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9] -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] -; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7] -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 -; GFX1200-NEXT: .LBB15_16: ; %Flow49 -; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo -; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo -; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| -; GFX1200-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 -; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo -; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 - %r1 = frem <2 x double> %r0, - store <2 x double> %r1, ptr addrspace(1) %out, align 16 - ret void -} - -define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { -; SI-LABEL: frem_v2f64_const: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 
s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v3, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; CI-LABEL: frem_v2f64_const: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v0, 0 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; CI-NEXT: v_mov_b32_e32 v2, v0 -; CI-NEXT: v_mov_b32_e32 v3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CI-NEXT: s_endpgm -; -; VI-LABEL: frem_v2f64_const: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; VI-NEXT: v_mov_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: frem_v2f64_const: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: frem_v2f64_const: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: frem_v2f64_const: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 
0x3ff00000 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX1150-LABEL: frem_v2f64_const: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v2, v0 -; GFX1150-NEXT: v_mov_b32_e32 v3, v0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1150-NEXT: s_endpgm -; -; GFX1200-LABEL: frem_v2f64_const: -; GFX1200: ; %bb.0: -; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 -; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] -; GFX1200-NEXT: s_endpgm - %r0 = frem <2 x double> , - store <2 x double> %r0, ptr addrspace(1) %out, align 16 - ret void -} - - - -attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 72c2003058a01..c77806e3e6ebc 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 + declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind 
readnone declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone @@ -256,9 +257,9 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_and_b32 s2, s2, 31 ; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -397,9 +398,9 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_mov_b32 s3, s0 ; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23 ; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -555,11 +556,11 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_and_b32 s0, s0, 31 ; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -768,11 +769,11 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25 ; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 7afb2cf317869..d62499f328ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -9,6 +9,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16 + declare i32 @llvm.fshr.i32(i32, i32, i32) declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) @@ -222,106 +223,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_i32_imm_src0(ptr addrspace(1) %in, i32 %x, i32 %y) { -; SI-LABEL: fshr_i32_imm_src0: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s9, 7 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s8, s3 -; SI-NEXT: s_and_b32 s0, s2, 31 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: fshr_i32_imm_src0: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s5, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_and_b32 s2, s2, 31 -; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_i32_imm_src0: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s5, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_and_b32 s2, s2, 31 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_i32_imm_src0: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T1.X, literal.x, KC0[2].W, KC0[2].Z, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_i32_imm_src0: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s5, 7 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, s3 -; GFX10-NEXT: s_and_b32 s2, s2, 31 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_i32_imm_src0: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s5, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s3 -; GFX11-NEXT: s_and_b32 s2, s2, 31 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_i32_imm_src0: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s5, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_and_b32 s2, s2, 31 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, 
s[0:1] -; GFX12-NEXT: s_endpgm -entry: - %0 = call i32 @llvm.fshr.i32(i32 7, i32 %y, i32 %x) - store i32 %0, ptr addrspace(1) %in - ret void -} - define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry @@ -356,9 +257,9 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; VI-NEXT: s_and_b32 s0, s6, 31 ; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 ; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -489,9 +390,9 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_mov_b32 s3, s0 ; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 9 ; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 7 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -582,145 +483,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_v2i32_imm_src1(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { -; SI-LABEL: fshr_v2i32_imm_src1: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s8, 9 -; SI-NEXT: s_mov_b32 s10, 7 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_and_b32 s1, s3, 31 -; SI-NEXT: s_mov_b32 s11, s0 -; SI-NEXT: s_and_b32 s0, s2, 31 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; SI-NEXT: s_lshr_b64 s[0:1], s[10:11], s0 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: 
fshr_v2i32_imm_src1: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s6, 9 -; VI-NEXT: s_mov_b32 s8, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s7, s1 -; VI-NEXT: s_and_b32 s1, s3, 31 -; VI-NEXT: s_mov_b32 s9, s0 -; VI-NEXT: s_and_b32 s0, s2, 31 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_v2i32_imm_src1: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s4, 9 -; GFX9-NEXT: s_mov_b32 s8, 7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s9, s0 -; GFX9-NEXT: s_and_b32 s0, s2, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_v2i32_imm_src1: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, literal.x, KC0[3].Z, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, literal.x, KC0[3].Y, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_v2i32_imm_src1: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s4, 9 -; GFX10-NEXT: s_mov_b32 s8, 7 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: s_mov_b32 s9, s0 -; GFX10-NEXT: s_and_b32 s0, s2, 31 -; GFX10-NEXT: s_and_b32 s2, s3, 31 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_v2i32_imm_src1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, 9 -; GFX11-NEXT: s_mov_b32 s8, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s7, s1 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_and_b32 s0, s2, 31 -; GFX11-NEXT: s_and_b32 s2, s3, 31 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_v2i32_imm_src1: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, 9 -; GFX12-NEXT: s_mov_b32 s8, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s7, s1 -; GFX12-NEXT: s_mov_b32 s9, s0 -; GFX12-NEXT: s_and_b32 s0, s2, 31 -; GFX12-NEXT: s_and_b32 s2, s3, 31 -; GFX12-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: 
global_store_b64 v2, v[0:1], s[4:5] -; GFX12-NEXT: s_endpgm -entry: - %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> , <2 x i32> %y) - store <2 x i32> %0, ptr addrspace(1) %in - ret void -} - define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; SI-LABEL: fshr_v4i32: ; SI: ; %bb.0: ; %entry @@ -771,11 +533,11 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_lshr_b64 s[2:3], s[14:15], s2 ; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1 ; VI-NEXT: s_lshr_b64 s[0:1], s[12:13], s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -956,11 +718,11 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 9 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 7 ; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -1079,194 +841,6 @@ entry: ret void } -define amdgpu_kernel void @fshr_v4i32_imm_src0(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { -; SI-LABEL: fshr_v4i32_imm_src0: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 33 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, s11 -; SI-NEXT: s_and_b32 s4, s15, 31 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 -; SI-NEXT: s_mov_b32 s11, 9 -; SI-NEXT: s_and_b32 s5, s14, 31 -; 
SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 -; SI-NEXT: s_mov_b32 s11, 7 -; SI-NEXT: s_mov_b32 s10, s9 -; SI-NEXT: s_and_b32 s5, s13, 31 -; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5 -; SI-NEXT: s_mov_b32 s9, 1 -; SI-NEXT: s_and_b32 s5, s12, 31 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s5 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; VI-LABEL: fshr_v4i32_imm_src0: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s1, 33 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s7, 7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s11 -; VI-NEXT: s_and_b32 s4, s15, 31 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; VI-NEXT: s_mov_b32 s11, 9 -; VI-NEXT: s_and_b32 s1, s14, 31 -; VI-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 -; VI-NEXT: s_mov_b32 s6, s9 -; VI-NEXT: s_and_b32 s1, s13, 31 -; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; VI-NEXT: s_mov_b32 s9, 1 -; VI-NEXT: s_and_b32 s1, s12, 31 -; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: fshr_v4i32_imm_src0: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_mov_b32 s1, 33 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s11 -; GFX9-NEXT: s_and_b32 s4, s15, 31 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_mov_b32 s11, 9 -; GFX9-NEXT: s_and_b32 s1, s14, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s1 -; GFX9-NEXT: s_mov_b32 s6, s9 -; GFX9-NEXT: 
s_and_b32 s1, s13, 31 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s1 -; GFX9-NEXT: s_mov_b32 s9, 1 -; GFX9-NEXT: s_and_b32 s1, s12, 31 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX9-NEXT: s_endpgm -; -; R600-LABEL: fshr_v4i32_imm_src0: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: PAD -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, literal.x, KC0[4].X, KC0[5].X, -; R600-NEXT: 33(4.624285e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Z, literal.x, KC0[3].W, KC0[4].W, -; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.Y, literal.x, KC0[3].Z, KC0[4].Z, -; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T0.X, 1, KC0[3].Y, KC0[4].Y, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; GFX10-LABEL: fshr_v4i32_imm_src0: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 33 -; GFX10-NEXT: s_mov_b32 s3, 7 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s11 -; GFX10-NEXT: s_and_b32 s4, s15, 31 -; GFX10-NEXT: s_mov_b32 s11, 9 -; GFX10-NEXT: s_and_b32 s5, s14, 31 -; GFX10-NEXT: s_mov_b32 s2, s9 -; GFX10-NEXT: s_and_b32 s13, s13, 31 -; GFX10-NEXT: s_mov_b32 s9, 1 -; GFX10-NEXT: s_and_b32 s12, s12, 31 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 
v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fshr_v4i32_imm_src0: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s1, 33 -; GFX11-NEXT: s_mov_b32 s3, 7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s11 -; GFX11-NEXT: s_and_b32 s6, s15, 31 -; GFX11-NEXT: s_mov_b32 s11, 9 -; GFX11-NEXT: s_and_b32 s7, s14, 31 -; GFX11-NEXT: s_mov_b32 s2, s9 -; GFX11-NEXT: s_and_b32 s13, s13, 31 -; GFX11-NEXT: s_mov_b32 s9, 1 -; GFX11-NEXT: s_and_b32 s12, s12, 31 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fshr_v4i32_imm_src0: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s1, 33 -; GFX12-NEXT: s_mov_b32 s3, 7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s0, s11 -; GFX12-NEXT: s_and_b32 s6, s15, 31 -; GFX12-NEXT: s_mov_b32 s11, 9 -; GFX12-NEXT: s_and_b32 s7, s14, 31 -; GFX12-NEXT: s_mov_b32 s2, s9 -; GFX12-NEXT: s_and_b32 s13, s13, 31 -; GFX12-NEXT: s_mov_b32 s9, 1 -; GFX12-NEXT: s_and_b32 s12, s12, 31 -; GFX12-NEXT: s_lshr_b64 s[0:1], s[0:1], s6 -; GFX12-NEXT: s_lshr_b64 s[6:7], s[10:11], s7 -; GFX12-NEXT: s_lshr_b64 s[8:9], s[8:9], s12 -; GFX12-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] -; GFX12-NEXT: s_endpgm -entry: - %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> , <4 x i32> %x, <4 x i32> %y) - store <4 x i32> %0, ptr addrspace(1) %in - ret void -} - define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { ; GFX89-LABEL: v_fshr_i32: ; GFX89: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c06011c259f9b..93bce89baa34f 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1990,10 +1990,10 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 21e5994819997..851dbf34f65f5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -7583,9 +7583,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; 
GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7618,9 +7618,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7817,9 +7817,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7852,9 +7852,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8054,9 +8054,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: 
v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8091,9 +8091,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9739,11 +9739,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9784,11 +9784,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10263,11 +10263,11 @@ define half 
@global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10308,11 +10308,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10759,11 +10759,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10801,11 +10801,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_add_f32_e32 v3, 
v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11260,11 +11260,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11303,11 +11303,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11766,11 +11766,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], 
s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11809,11 +11809,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12160,10 +12160,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12195,10 +12195,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -12533,10 +12533,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, 
v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -12567,10 +12567,10 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13044,11 +13044,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13089,11 +13089,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13555,11 +13555,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13598,11 +13598,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19568,9 +19568,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19618,9 +19618,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; 
GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19812,9 +19812,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19862,9 +19862,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -20061,9 +20061,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20113,9 +20113,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -20986,9 +20986,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -21036,9 +21036,9 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -21471,9 +21471,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -21521,9 +21521,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; 
GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -21961,9 +21961,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -22011,9 +22011,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -22439,9 +22439,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -22489,9 +22489,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, 
v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index dbd48d2a7cf8f..92642943ddec9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -4210,11 +4211,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -4244,11 +4245,11 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 
-; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5276,11 +5277,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5321,11 +5322,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5738,11 +5739,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5783,11 +5784,11 @@ define half 
@global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6172,11 +6173,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6214,11 +6215,11 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_max_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6616,11 +6617,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6659,11 +6660,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7063,11 +7064,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7106,11 +7107,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7414,10 +7415,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7449,10 +7450,10 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7747,10 +7748,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7781,10 +7782,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v4 ; 
GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8198,11 +8199,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8243,11 +8244,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8652,11 +8653,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) @@ -8695,11 +8696,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13836,9 +13837,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13886,9 +13887,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14130,9 +14131,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; 
GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14180,9 +14181,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14429,9 +14430,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14481,9 +14482,9 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15587,9 +15588,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -15637,9 +15638,9 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 7930ad8a15404..c4647bf10545f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -4210,11 +4211,11 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 -; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -4244,11 +4245,11 @@ define double 
@global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11] +; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 @@ -5276,11 +5277,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5321,11 +5322,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5738,11 +5739,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: 
v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -5783,11 +5784,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6172,11 +6173,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6214,11 +6215,11 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_min_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: 
buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6616,11 +6617,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6659,11 +6660,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7063,11 +7064,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7106,11 +7107,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: 
s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7414,10 +7415,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7449,10 +7450,10 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7747,10 +7748,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: 
buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7781,10 +7782,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8198,11 +8199,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8243,11 +8244,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8652,11 +8653,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 
v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8695,11 +8696,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -13836,9 +13837,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13886,9 +13887,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ 
-14130,9 +14131,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14180,9 +14181,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14429,9 +14430,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14481,9 +14482,9 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc 
; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15587,9 +15588,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -15637,9 +15638,9 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index b79d0df960a0f..b81e4872a2057 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s + ; -------------------------------------------------------------------- ; float ; -------------------------------------------------------------------- @@ -3921,9 +3922,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: 
v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3956,9 +3957,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4173,9 +4174,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 -; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: v_mov_b32_e32 v3, v11 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4208,9 +4209,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v0, v8 ; GFX6-NEXT: v_mov_b32_e32 v1, v9 -; GFX6-NEXT: v_mov_b32_e32 v2, v10 ; GFX6-NEXT: v_mov_b32_e32 v3, v11 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4428,9 +4429,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v8 
; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-NEXT: v_mov_b32_e32 v2, v8 ; GFX7-NEXT: v_mov_b32_e32 v3, v9 ; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4465,9 +4466,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v1, v7 -; GFX6-NEXT: v_mov_b32_e32 v2, v8 ; GFX6-NEXT: v_mov_b32_e32 v3, v9 ; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -5983,11 +5984,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6028,11 +6029,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6420,11 +6421,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr 
addrspace(1) %p ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6465,11 +6466,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6834,11 +6835,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6876,11 +6877,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_sub_f32_e32 v3, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7253,11 +7254,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7296,11 +7297,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7675,11 +7676,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7718,11 
+7719,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8006,10 +8007,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8041,10 +8042,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8319,10 +8320,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 
v2, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_mov_b32_e32 v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8353,10 +8354,10 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -8745,11 +8746,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8790,11 +8791,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, v3, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -9174,11 +9175,11 @@ define void 
@global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_mov_b32_e32 v8, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9217,11 +9218,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v7, v3, v6 +; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, v3 ; GFX6-NEXT: v_mov_b32_e32 v7, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14321,9 +14322,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14371,9 +14372,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; 
GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14598,9 +14599,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14648,9 +14649,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -14880,9 +14881,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -14932,9 +14933,9 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v0 -; 
GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -15964,9 +15965,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX7-NEXT: v_mov_b32_e32 v9, v7 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -16014,9 +16015,9 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7 ; GFX6-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v6, v8, v2 -; GFX6-NEXT: v_mov_b32_e32 v9, v7 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 ; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 890ebddf36801..0524b284a3d05 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4793,8 +4793,8 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4913,8 +4913,8 @@ define amdgpu_ps void 
@global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index da132d0269e6b..8158d28181ab0 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -5321,9 +5321,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5345,9 +5345,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; GFX11-LABEL: atomic_cmpxchg_i32_offset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv @@ -5387,10 +5387,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: 
s_mov_b32 s3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5417,9 +5417,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -5467,8 +5467,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_addc_u32 s1, s3, s1 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5556,8 +5556,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5638,9 +5638,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 ; 
VI-NEXT: s_waitcnt vmcnt(0) @@ -5662,9 +5662,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; GFX11-LABEL: atomic_cmpxchg_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl1_inv @@ -5703,8 +5703,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5733,9 +5733,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv @@ -5780,8 +5780,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s2, s0 ; VI-NEXT: s_addc_u32 s1, s3, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ 
-5866,8 +5866,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index fb5e669d680f5..59de06b9c53bf 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1812,8 +1812,8 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2597,8 +2597,8 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3364,8 +3364,8 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4208,8 +4208,8 @@ define 
amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4993,8 +4993,8 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB78_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5778,8 +5778,8 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5958,8 +5958,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -6064,8 +6064,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -6169,8 +6169,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -6272,8 +6272,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -6946,8 +6946,8 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7126,8 +7126,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -7232,8 +7232,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -7345,8 +7345,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -8019,8 +8019,8 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8767,8 +8767,8 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8947,8 +8947,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dword s3, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -9053,8 +9053,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dword s7, s[4:5], 0x10 ; VI-NEXT: s_add_u32 s4, s4, 16 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 
s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -9248,8 +9248,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dword s7, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s7 @@ -9960,8 +9960,8 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB140_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10579,9 +10579,9 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10784,9 +10784,9 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[36:37], 0 ; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: .LBB150_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 
daa053ceea161..022b226aa7704 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -140,10 +140,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -217,11 +217,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -428,10 +428,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -502,11 +502,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: 
s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -708,10 +708,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -785,11 +785,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -996,10 +996,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1070,11 +1070,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; 
CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1276,10 +1276,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1353,11 +1353,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1564,10 +1564,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1638,11 +1638,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: 
s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1835,10 +1835,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -1906,11 +1906,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2105,10 +2105,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -2173,11 +2173,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: 
v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2367,10 +2367,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -2438,11 +2438,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2637,10 +2637,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -2705,11 +2705,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 
s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2899,10 +2899,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -2970,11 +2970,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3169,10 +3169,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -3237,11 +3237,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, 
s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3431,10 +3431,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -3502,11 +3502,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3701,10 +3701,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm @@ -3769,11 +3769,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; 
CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3972,10 +3972,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4049,11 +4049,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4260,10 +4260,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4334,11 +4334,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, 
ptr a ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4650,10 +4650,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4727,11 +4727,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4938,10 +4938,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5012,11 +5012,11 @@ define amdgpu_kernel void 
@atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5218,10 +5218,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5295,11 +5295,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5506,10 +5506,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ 
-5580,11 +5580,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5878,12 +5878,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: s_mov_b64 s[8:9], s[0:1] ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v5, s5 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5899,11 +5899,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_addc_u32 s1, s1, s5 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5957,7 +5957,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 -; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: s_mov_b32 s0, s10 ; CI-NEXT: s_mov_b32 s1, s11 ; CI-NEXT: s_mov_b32 s10, 0 @@ -5966,6 +5965,7 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; CI-NEXT: v_mov_b32_e32 v1, s13 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5984,10 +5984,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s3, s9, s3 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6198,11 +6198,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[8:9], s[0:1] ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6216,11 +6216,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 ; VI-NEXT: s_add_u32 s0, s0, s4 ; VI-NEXT: s_addc_u32 s1, s1, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6273,7 +6273,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 s2, 
-1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b64 s[6:7], s[14:15], 3 -; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: s_mov_b32 s0, s10 ; CI-NEXT: s_mov_b32 s1, s11 ; CI-NEXT: s_mov_b32 s10, 0 @@ -6282,6 +6281,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v1, s13 ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v4, s6 ; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6297,11 +6297,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ; VI-NEXT: s_lshl_b64 s[2:3], s[14:15], 3 ; VI-NEXT: s_add_u32 s2, s8, s2 ; VI-NEXT: s_addc_u32 s3, s9, s3 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6555,10 +6555,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6636,10 +6636,10 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: 
buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6714,10 +6714,10 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s4, s2 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: s_mov_b32 s5, s3 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6899,8 +6899,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -6969,8 +6969,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm @@ -7036,8 +7036,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 ; CI-NEXT: s_endpgm @@ -7231,10 +7231,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: 
s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -7434,10 +7434,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 6a4c2849ba4a3..66edbec65b56b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s + ; --------------------------------------------------------------------- ; atomicrmw xchg ; --------------------------------------------------------------------- @@ -234,9 +235,9 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: 
v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -343,9 +344,9 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -672,9 +673,9 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -781,9 +782,9 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1110,9 +1111,9 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1219,9 +1220,9 @@ define amdgpu_gfx 
i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s34 ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1536,8 +1537,8 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: .LBB32_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 ; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -1561,8 +1562,8 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc @@ -1629,8 +1630,8 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: .LBB33_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 ; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc @@ -1652,8 +1653,8 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX9-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc @@ -1725,9 +1726,9 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB34_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1929,15 +1930,15 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB36_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -1961,8 +1962,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: 
global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc @@ -2037,8 +2038,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: .LBB37_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_mov_b32_e32 v8, v1 ; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 ; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc @@ -2062,8 +2063,8 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc @@ -2562,8 +2563,8 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB44_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2760,8 +2761,8 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3412,8 +3413,8 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; 
VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3622,8 +3623,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4375,8 +4376,8 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB64_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4573,8 +4574,8 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5201,8 +5202,8 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB74_1: ; %atomicrmw.start ; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5399,8 +5400,8 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6042,10 +6043,10 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6259,10 +6260,10 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6474,12 +6475,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB88_1: ; 
%atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6593,12 +6594,12 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6712,10 +6713,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_add_u32 s4, s0, s4 ; VI-NEXT: s_addc_u32 s5, s1, s5 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -6828,10 +6829,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -7394,10 +7395,10 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: 
v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7611,10 +7612,10 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7826,12 +7827,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7945,12 +7946,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8068,10 +8069,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], 
s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -8634,10 +8635,10 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8851,10 +8852,10 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9510,10 +9511,10 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9727,10 +9728,10 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 
s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9942,12 +9943,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 +; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10061,12 +10062,12 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10175,9 +10176,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -10285,10 +10286,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: 
s_add_u32 s6, s0, s6 ; VI-NEXT: s_addc_u32 s7, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 @@ -10668,8 +10669,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: .LBB133_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10696,8 +10697,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10769,8 +10770,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; VI-NEXT: .LBB134_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_mov_b32_e32 v9, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc ; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] @@ -10795,8 +10796,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 
v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] @@ -10873,8 +10874,8 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB135_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11089,14 +11090,14 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB137_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -11122,8 +11123,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] @@ -11202,8 +11203,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: .LBB138_1: ; %atomicrmw.start ; VI-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_mov_b32_e32 v7, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc ; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] @@ -11229,8 +11230,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] @@ -11797,10 +11798,10 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12037,10 +12038,10 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[38:39], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: .LBB147_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index a6a886dc321ce..70fe85b1c1ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -452,14 +452,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -793,14 +793,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,14 +1665,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: 
s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2006,14 +2006,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2938,14 +2938,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, 
v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3279,14 +3279,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3707,14 +3707,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4048,14 +4048,14 @@ define 
amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4979,14 +4979,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5346,14 +5346,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 
-; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5759,8 +5759,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5792,10 +5792,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5826,10 +5826,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -5973,8 +5973,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6006,10 +6006,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6040,10 +6040,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6236,14 +6236,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6620,14 +6620,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7663,14 +7663,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; 
GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8047,14 +8047,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9090,14 +9090,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: 
v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9474,14 +9474,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9999,14 +9999,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10383,14 
+10383,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11426,14 +11426,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -11810,14 +11810,14 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; 
GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index e62d6c593215b..a6b4679dbfb8b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -696,14 +696,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 
s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,14 +1397,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1729,14 +1729,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 
s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2430,14 +2430,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2762,14 +2762,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3549,14 +3549,14 @@ define 
amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,14 +3929,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4813,14 +4813,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; 
GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,14 +5193,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6077,14 +6077,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 
s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,14 +6457,14 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 1c14ff65dcbb6..10e4448c88797 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -364,14 +364,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; 
GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -696,14 +696,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,14 +1397,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1729,14 +1729,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: 
s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2430,14 +2430,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2762,14 +2762,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 
; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3549,14 +3549,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3929,14 +3929,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 
s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -4813,14 +4813,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5193,14 +5193,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6077,14 +6077,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 
s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6457,14 +6457,14 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index b97c3cdf32d12..7f689ec0e4ed8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -512,14 +512,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, 
div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -879,14 +879,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -1837,14 +1837,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, 
v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2204,14 +2204,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3162,14 +3162,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 
s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3529,14 +3529,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3983,14 +3983,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -4350,14 +4350,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, 
div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -5307,14 +5307,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5674,14 +5674,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; 
GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6087,8 +6087,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6120,10 +6120,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6154,10 +6154,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 ; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6301,8 +6301,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX9-DPP-NEXT: ; %bb.1: ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6334,10 +6334,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1064-DPP-NEXT: ; %bb.1: -; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6368,10 +6368,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: -; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 @@ -6564,14 +6564,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; 
GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6948,14 +6948,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -7990,14 +7990,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -8374,14 +8374,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -9417,14 +9417,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 
v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -9801,14 +9801,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -10326,14 +10326,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -10710,14 +10710,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; 
GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -11752,14 +11752,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -12136,14 +12136,14 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-DPP-NEXT: s_mov_b32 s14, s10 -; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; 
GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-DPP-NEXT: s_mov_b32 s32, 0 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index da1175c02e94a..965c31970404f 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -86,7 +86,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, 4398046511103 ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed 
$sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} @@ -117,7 +117,7 @@ body: | ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr68 - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, 17592186044415, implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 8e427a6ef2023..c6c021e6e89d5 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -100,10 +100,10 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 -; CIVI-NEXT: v_mov_b32_e32 v2, s4 ; CIVI-NEXT: v_mov_b32_e32 v4, s3 -; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v2, s4 ; CIVI-NEXT: v_mov_b32_e32 v3, s5 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: v_mov_b32_e32 v5, s2 ; CIVI-NEXT: flat_store_short v[2:3], v4 @@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; 
CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v3, s3 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm @@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -163,8 +163,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 @@ -180,8 +180,8 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -193,10 +193,10 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out @@ -461,19 +461,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 -; CI-NEXT: s_lshr_b32 s7, s0, 16 ; CI-NEXT: s_lshr_b32 s8, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; CI-NEXT: s_lshr_b32 s6, s2, 16 +; CI-NEXT: s_lshr_b32 s7, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_add_u32 s0, s4, 16 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -492,19 +492,19 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 -; VI-NEXT: s_lshr_b32 s7, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 ; VI-NEXT: s_lshr_b32 s6, s2, 16 +; VI-NEXT: s_lshr_b32 s7, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 
v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -659,15 +659,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v7, s3 ; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -683,15 +683,15 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -732,17 +732,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_lshr_b32 s5, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], 
v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v9, s3 ; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -760,17 +760,17 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -820,32 +820,33 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; CI-NEXT: s_lshr_b32 s7, s2, 16 ; CI-NEXT: s_lshr_b32 s8, s1, 16 -; CI-NEXT: s_lshr_b32 s6, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; CI-NEXT: s_add_u32 s0, s4, 48 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: s_lshr_b32 s6, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; CI-NEXT: s_add_u32 s0, s4, 48 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: s_add_u32 s0, s4, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v2, 
s6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: s_add_u32 s0, s4, 32 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s1, s5, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CI-NEXT: s_nop 0 ; CI-NEXT: v_mov_b32_e32 v9, s1 @@ -865,37 +866,38 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; VI-NEXT: s_lshr_b32 s7, s1, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; VI-NEXT: s_add_u32 s0, s4, 48 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; VI-NEXT: s_add_u32 s0, s4, 48 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; VI-NEXT: 
v_mov_b32_e32 v17, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 @@ -1477,10 +1479,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1490,6 +1492,7 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v15, s3 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1511,19 +1514,18 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; 
CI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v17, s1 ; CI-NEXT: v_mov_b32_e32 v16, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] @@ -1550,10 +1552,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v20, s2 @@ -1842,6 +1844,7 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1851,7 +1854,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1917,6 +1919,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 
0 +; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v9, s1 ; CI-NEXT: v_mov_b32_e32 v8, s0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1930,7 +1933,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; CI-NEXT: v_mov_b32_e32 v11, s3 ; CI-NEXT: v_mov_b32_e32 v10, s2 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1948,6 +1950,7 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1959,7 +1962,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v10, s2 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -2036,13 +2038,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v7, s3 ; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -2087,13 +2089,13 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v8, s3 ; VI-NEXT: v_mov_b32_e32 v7, 
s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2220,31 +2222,30 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v19, s3 ; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x70 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_add_u32 s2, s0, 0x70 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 @@ -2257,33 +2258,34 @@ define 
amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: s_add_u32 s2, s0, 0x60 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: s_add_u32 s2, s0, 0x60 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x50 -; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: s_add_u32 s0, s0, 64 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_add_u32 s0, s0, 64 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v13, s1 ; CI-NEXT: v_mov_b32_e32 v12, s0 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] @@ -2317,10 +2319,10 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v18, s3 ; VI-NEXT: v_mov_b32_e32 v17, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: v_mov_b32_e32 v12, s1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_add_u32 s2, 
s0, 0x50 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2332,22 +2334,22 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 64 +; VI-NEXT: v_mov_b32_e32 v14, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: v_mov_b32_e32 v16, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] ; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2365,15 +2367,15 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 ; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 ; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: 
v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v14, s1 ; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: v_mov_b32_e32 v14, s1 ; VI-NEXT: v_mov_b32_e32 v13, s0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] ; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] @@ -2646,8 +2648,8 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: flat_store_short v[0:1], v2 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_or_b32_e32 v2, v4, v3 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -2665,14 +2667,14 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm @@ -2811,8 +2813,8 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 @@ -2849,8 +2851,8 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; 
VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 @@ -2943,15 +2945,15 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_add_u32 s4, s2, 48 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_addc_u32 s5, s3, 0 -; CI-NEXT: v_mov_b32_e32 v9, s3 -; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v9, s3 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: v_mov_b32_e32 v13, s3 @@ -2964,10 +2966,11 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -2980,12 +2983,11 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 @@ 
-3014,14 +3016,14 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 48 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3379,50 +3381,50 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 ; CI-NEXT: s_lshr_b32 s0, s5, 16 -; CI-NEXT: s_lshr_b32 s11, s1, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: s_lshr_b32 s11, s1, 16 ; CI-NEXT: s_lshr_b32 s12, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; CI-NEXT: s_lshr_b32 s10, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 ; CI-NEXT: s_lshr_b32 s0, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s12 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 ; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; CI-NEXT: v_add_f32_e32 v1, v1, v9 -; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_add_f32_e32 v3, v3, v11 ; CI-NEXT: v_add_f32_e32 v2, v2, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_add_f32_e32 v5, v5, v13 -; CI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_add_f32_e32 v4, v4, v12 +; CI-NEXT: v_add_f32_e32 v1, v1, v9 +; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_add_f32_e32 v7, v7, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_add_f32_e32 v6, v6, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v5, v5, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v4, v4, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v3, v7, v3 +; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_or_b32_e32 v1, v5, v1 ; CI-NEXT: v_or_b32_e32 v0, v4, v0 ; CI-NEXT: v_mov_b32_e32 v4, s8 -; CI-NEXT: v_or_b32_e32 v3, v7, v3 -; CI-NEXT: v_or_b32_e32 v2, v6, v2 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16a..29aedda49da70 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s + define void @main(i1 %arg) #0 { ; CHECK-LABEL: main: ; CHECK: ; %bb.0: ; %bb @@ -151,8 +152,8 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 ; 
CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 ; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 @@ -203,10 +204,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s20, s16 ; CHECK-NEXT: s_mov_b32 s21, s16 -; CHECK-NEXT: v_mov_b32_e32 v1, s20 ; CHECK-NEXT: s_mov_b32 s17, s16 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s20 ; CHECK-NEXT: v_mov_b32_e32 v2, s21 ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[16:19] dmask:0x1 ; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index 2daed9b69384f..23ce500e7b25b 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -286,11 +286,11 @@ define amdgpu_kernel void @llvm_ubsantrap() { define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V4-NEXT: s_add_u32 s0, s8, 8 -; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 @@ -311,11 +311,11 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 -; GFX8V5-NEXT: s_add_u32 s0, s8, 8 -; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc ; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc 
; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 8fcf1ad3fbc95..5ab2dcbedb537 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -2125,15 +2125,15 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2174,15 +2174,15 @@ define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -2479,15 +2479,15 @@ define 
amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2528,15 +2528,15 @@ define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %ou ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -2837,15 +2837,15 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, 
s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -2887,15 +2887,15 @@ define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3192,15 +3192,15 @@ define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3240,15 +3240,15 @@ define 
amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) { ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3509,7 +3509,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5 ; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6 @@ -3525,12 +3524,13 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3570,15 +3570,15 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 
; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -3595,6 +3595,7 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE: ; %bb.0: ; %entry ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; GFX9-IDXMODE-NEXT: s_load_dword s2, s[4:5], 0x34 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3 @@ -3610,10 +3611,9 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 @@ -3826,7 +3826,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s9 ; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s10 ; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s11 @@ 
-3845,12 +3844,13 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -3891,15 +3891,15 @@ define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 @@ -4328,9 +4328,9 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 ; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c ; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12 ; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc 
@@ -4341,25 +4341,25 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 ; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12 -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12 -; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 ; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 @@ -4826,15 +4826,15 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc -; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 @@ -6399,16 +6399,16 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6886,8 +6886,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000 ; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 ; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 -; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; SI-MOVREL-NEXT: s_add_i32 m0, s2, 2 +; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6941,8 +6941,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32 ; VI-MOVREL-NEXT: s_add_i32 m0, s2, 2 -; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48 +; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15 ; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14 ; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13 ; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12 @@ -6960,38 +6960,39 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32 -; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3 ; VI-MOVREL-NEXT: 
s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 +; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0 -; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: s_nop 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0 +; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5 -; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50 -; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0 +; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23] @@ -7026,8 +7027,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off -; 
VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -7048,38 +7049,39 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 +; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0 -; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: s_nop 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0 +; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 +; 
VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5 -; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50 -; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0 +; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23] @@ -7114,8 +7116,8 @@ define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %o ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 2 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12 @@ -8003,13 +8005,13 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8033,17 +8035,17 @@ define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 
; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8265,13 +8267,13 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8295,17 +8297,17 @@ define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, p ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: 
s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8528,13 +8530,13 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 48 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_add_u32 s4, s2, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s5, s3, 0 +; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s4 -; VI-MOVREL-NEXT: s_add_u32 s2, s2, 16 ; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; VI-MOVREL-NEXT: s_addc_u32 s3, s3, 0 ; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8558,17 +8560,17 @@ define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 48 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_add_u32 s4, s2, 32 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s5, s3, 0 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3 +; VI-IDXMODE-NEXT: s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc ; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s4 -; VI-IDXMODE-NEXT: 
s_add_u32 s2, s2, 16 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s5 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s3, 0 ; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc @@ -8835,15 +8837,15 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s23 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16 -; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32 +; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-MOVREL-NEXT: s_nop 0 -; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16 +; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3 ; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0 ; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-MOVREL-NEXT: s_nop 0 @@ -8885,15 +8887,15 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, ; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16 ; VI-IDXMODE-NEXT: s_set_gpr_idx_off ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-IDXMODE-NEXT: s_nop 0 -; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2 ; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16 +; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3 ; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0 ; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-IDXMODE-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..1ce5ff51e6a5b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -23,8 +23,8 @@ define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -68,11 +68,11 @@ define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i3 ; GCN-NEXT: s_cselect_b32 s1, s1, 1 ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -95,8 +95,8 @@ define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -116,7 +116,6 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 @@ -124,8 +123,9 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 ; GCN-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 @@ -149,7 +149,6 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_add_u32 s2, s0, 48 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 @@ -169,12 +168,13 @@ define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %v ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -204,7 +204,6 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: s_mov_b32 m0, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NEXT: v_mov_b32_e32 v3, s39 @@ -239,36 +238,37 @@ define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %v ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 
v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -300,8 +300,8 @@ define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -428,8 +428,8 @@ define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -452,8 +452,8 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; 
GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm @@ -550,11 +550,11 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, ; GCN-NEXT: s_or_b32 s0, s0, s8 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_or_b32 s0, s0, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -577,11 +577,11 @@ define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %v ; GCN-NEXT: s_cmp_eq_u32 s6, 0 ; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1 ; GCN-NEXT: s_cselect_b32 s0, 0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -618,13 +618,13 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %v ; GCN-NEXT: s_add_u32 s0, s10, 16 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s10, 32 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NEXT: s_add_u32 s0, s10, 32 ; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -633,9 +633,9 @@ define amdgpu_kernel void @double5_inselt(ptr addrspace(1) 
%out, <5 x double> %v ; GCN-NEXT: s_addc_u32 s1, s11, 0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -674,15 +674,15 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v1, v16 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -732,17 +732,18 @@ define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %v ; GCN-NEXT: s_addc_u32 s1, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v15, s1 ; GCN-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-NEXT: s_add_u32 s0, s6, 48 +; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-NEXT: s_addc_u32 s1, s7, 0 ; GCN-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_add_u32 s0, s6, 32 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_addc_u32 s1, s7, 0 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -800,39 +801,39 @@ define 
amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> ; GCN-NEXT: v_movreld_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 -; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: v_mov_b32_e32 v32, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v33, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: v_mov_b32_e32 v28, s2 ; GCN-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-NEXT: v_mov_b32_e32 v29, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: v_mov_b32_e32 v24, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v25, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 @@ -898,43 +899,44 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v31, s3 ; GCN-NEXT: v_mov_b32_e32 v30, s2 ; GCN-NEXT: s_add_u32 s2, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v31, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 
0 ; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: v_mov_b32_e32 v20, s2 ; GCN-NEXT: s_add_u32 s2, s0, 48 +; GCN-NEXT: v_mov_b32_e32 v21, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: v_mov_b32_e32 v16, s2 ; GCN-NEXT: s_add_u32 s2, s0, 32 +; GCN-NEXT: v_mov_b32_e32 v17, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NEXT: s_add_u32 s2, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NEXT: v_mov_b32_e32 v8, s2 -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29] -; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] @@ -1830,9 +1832,9 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_readlane_b32 s1, v6, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 7cbf9aeacfe48..b93b29464b309 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1979,10 +1979,10 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: .LBB42_2: ; %if ; SI-NEXT: s_load_dword s5, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -2003,10 +2003,10 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s5, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 300124848c1aa..cb9b6888d1fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -40,10 +40,10 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK-NEXT: s_mov_b32 s50, s48 ; CHECK-NEXT: s_cselect_b32 s51, 0, s1 ; CHECK-NEXT: s_cselect_b32 s55, 0, s35 -; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; CHECK-NEXT: s_cselect_b32 s52, 0, s2 ; CHECK-NEXT: s_cselect_b32 s56, 0, s36 ; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43 +; CHECK-NEXT: 
v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; CHECK-NEXT: v_mov_b32_e32 v4, s50 ; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0 ; CHECK-NEXT: s_cselect_b32 s53, 0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index c001df48499c7..f316f3d5defaa 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -98,10 +98,10 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_or_b32_e32 v8, v15, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: .LBB0_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB0_8: ; %Flow2 @@ -227,10 +227,10 @@ define bfloat @uitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, v8 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: v_or_b32_e32 v7, v14, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v7 -; GCN-NEXT: v_mov_b32_e32 v1, v8 ; GCN-NEXT: .LBB1_7: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: .LBB1_8: ; %Flow2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 2f9182e6e7c6a..56df10707667f 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -94,10 +94,10 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 -; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: .LBB0_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, 
s[12:13] ; SDAG-NEXT: .LBB0_8: ; %Flow2 @@ -224,12 +224,10 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB0_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB0_8: ; %Flow2 @@ -349,10 +347,10 @@ define float @uitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 -; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: .LBB1_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_8: ; %Flow2 @@ -469,12 +467,10 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB1_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB1_8: ; %Flow2 @@ -610,11 +606,11 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v5, v5, v7 ; SDAG-NEXT: v_or_b32_e32 v4, v4, v6 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_mov_b32_e32 v6, v10 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; 
SDAG-NEXT: v_mov_b32_e32 v5, v1 ; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v6, v10 ; SDAG-NEXT: v_mov_b32_e32 v7, v11 ; SDAG-NEXT: .LBB2_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -661,9 +657,9 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -756,27 +752,24 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0 ; GISEL-NEXT: v_and_or_b32 v1, v12, v3, v1 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v9, v14, v0 ; GISEL-NEXT: v_mov_b32_e32 v2, v9 -; GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 ; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB2_8: ; %Flow2 ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] -; GISEL-NEXT: s_cbranch_execz .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[2:3] ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v3 ; GISEL-NEXT: v_or_b32_e32 v2, v4, v2 -; GISEL-NEXT: v_mov_b32_e32 v5, v3 ; GISEL-NEXT: v_mov_b32_e32 v4, v2 ; GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 @@ -898,11 +891,11 @@ define double @uitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; 
SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v4, v4, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v4 -; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mov_b32_e32 v2, v9 ; SDAG-NEXT: v_mov_b32_e32 v3, v10 ; SDAG-NEXT: .LBB3_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -945,9 +938,9 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-LABEL: uitofp_i128_to_f64: ; GISEL: ; %bb.0: ; %itofp-entry ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1032,10 +1025,10 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v8, v14, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 ; GISEL-NEXT: .LBB3_7: ; %Flow1 @@ -1044,8 +1037,8 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; GISEL-NEXT: s_cbranch_execz .LBB3_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v8 @@ -1183,10 +1176,10 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v8 -; SDAG-NEXT: v_mov_b32_e32 v1, v9 ; SDAG-NEXT: .LBB4_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB4_8: ; %Flow2 @@ -1314,12 +1307,10 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB4_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB4_8: ; %Flow2 @@ -1440,10 +1431,10 @@ define half @uitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 ; SDAG-NEXT: v_mov_b32_e32 v0, v7 -; SDAG-NEXT: v_mov_b32_e32 v1, v8 ; SDAG-NEXT: .LBB5_7: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB5_8: ; %Flow2 @@ -1561,12 +1552,10 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 ; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 ; GISEL-NEXT: .LBB5_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB5_8: ; %Flow2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll 
b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index a2da8876472ab..a05977d630217 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -863,11 +863,11 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_byte v[2:3], v5 @@ -1002,10 +1002,10 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: 
flat_store_short v[2:3], v4 @@ -1121,9 +1121,9 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm @@ -1200,9 +1200,9 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1400,8 +1400,8 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1474,8 +1474,8 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 
+; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1549,10 +1549,10 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_byte v[2:3], v4 @@ -1693,13 +1693,13 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 8 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1926,13 +1926,13 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s6, 16 ; VI-NEXT: s_addc_u32 s5, s7, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -2024,15 +2024,15 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s6, 16 ; VI-NEXT: 
s_addc_u32 s5, s7, 0 -; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -2129,19 +2129,19 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 -; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: s_addc_u32 s13, s9, 0 -; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s12 ; VI-NEXT: v_mov_b32_e32 v4, s13 ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -2271,19 +2271,19 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s12, s8, 32 -; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: s_addc_u32 s13, s9, 0 -; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s12 ; VI-NEXT: v_mov_b32_e32 v4, s13 ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, 
s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2653,8 +2653,8 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -2909,10 +2909,10 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3020,10 +3020,10 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3124,8 +3124,8 @@ define 
amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -3582,10 +3582,10 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -4050,21 +4050,21 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 @@ -4238,21 +4238,21 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; 
VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 @@ -4544,13 +4544,13 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s6, 1 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_byte v[4:5], v6 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -4563,8 +4563,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -4821,8 +4821,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 1 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; 
VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -4971,8 +4971,8 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -5095,10 +5095,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5120,10 +5120,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -5208,8 +5208,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5231,18 +5232,18 @@ define 
amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: s_add_u32 s2, s4, 50 ; VI-NEXT: s_addc_u32 s3, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_add_u32 s0, s4, 51 -; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] @@ -5680,12 +5681,12 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: s_add_u32 s2, s0, 2 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s4, 42 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ushort v4, v[4:5] @@ -6011,22 +6012,22 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v0, s20 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: v_mov_b32_e32 v1, s21 ; VI-NEXT: v_mov_b32_e32 v2, s22 ; VI-NEXT: v_mov_b32_e32 v3, s23 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: 
v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 2fa865ff4929c..c19f6f3b810e4 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -133,8 +133,8 @@ define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind { ; GCN-NEXT: v_mov_b32_e32 v6, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -156,8 +156,8 @@ define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v9, s5 ; GCN-NEXT: v_mov_b32_e32 v8, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -176,8 +176,8 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: global_store_byte v2, v3, s[0:1] offset:8 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -216,10 +216,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; 
GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: global_store_dword v[0:1], v2, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -578,8 +578,8 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; GCN-LABEL: byref_flat_i32_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: flat_load_dword v0, v[0:1] offset:8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 217c306a1ff93..d262f7dc03ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -59,8 +59,8 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -119,8 +119,8 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %s define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; 
GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -1477,8 +1477,8 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inre define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -1553,8 +1553,8 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv_inreg_src(<16 x float> inre define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl_inreg_src(<16 x float> inreg inreg %src, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] @@ -2197,8 +2197,8 @@ define <32 x float> @test_cvt_scale_pk32_f32_fp6_inreg_src(<6 x i32> inreg %src, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], 
v[34:39], v32 @@ -2226,8 +2226,8 @@ define <32 x float> @test_cvt_scale_pk32_f32_bf6_inreg_src(<6 x i32> inreg %src, ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32 @@ -2255,8 +2255,8 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src(<6 x i32> inreg ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16 @@ -2347,8 +2347,8 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src(<6 x i32> inreg ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index 6cc6ba732d805..e9834de001321 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -74,6 +74,7 @@ define i32 
@dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 { ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ASM-DAG: ; %bb.0: ; %entry entry: %dead = call i32 @llvm.amdgcn.dead.i32() br i1 %cond, label %if.then, label %if.end @@ -150,21 +151,21 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s1, 0x3fc00000 ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s7, s4 ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 ; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s6, s4 +; ASM-GISEL-TRUE16-NEXT: s_mov_b32 s7, s4 +; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 -; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-TRUE16-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 +; ASM-GISEL-TRUE16-NEXT: global_store_b32 v[17:18], v0, off +; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr6_vgpr7 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr8 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr9_vgpr10 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr15 ; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-TRUE16-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-TRUE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-TRUE16-NEXT: .LBB1_2: ; %if.end ; ASM-GISEL-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -195,21 +196,21 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s4, 0 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s1, 0x3fc00000 ; 
ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s7, s4 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s5, s4 ; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s6, s4 +; ASM-GISEL-FAKE16-NEXT: s_mov_b32 s7, s4 +; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v13, s6 -; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_add_nc_u32 v0, 15, v19 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v11, s4 +; ASM-GISEL-FAKE16-NEXT: global_store_b32 v[17:18], v0, off +; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr6_vgpr7 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr8 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr9_vgpr10 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr15 ; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr16 -; ASM-GISEL-FAKE16-NEXT: global_store_b32 v[17:18], v0, off -; ASM-GISEL-FAKE16-NEXT: ; implicit-def: $vgpr0 ; ASM-GISEL-FAKE16-NEXT: .LBB1_2: ; %if.end ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -221,6 +222,7 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ASM-DAG: ; %bb.0: ; %entry ; ASM-GISEL-LABEL: dead_struct: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -515,6 +517,7 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; ASM-DAG: ; %bb.0: ; %entry ; 
ASM-GISEL-LABEL: dead_array: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index 9e4824694e76a..38e91757b9763 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -101,8 +101,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -114,8 +114,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace( ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -204,8 +204,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -218,8 +218,8 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -264,8 +264,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -278,8 +278,8 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ 
-324,8 +324,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -338,8 +338,8 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -384,8 +384,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -398,8 +398,8 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -444,8 +444,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: 
v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -458,8 +458,8 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -504,8 +504,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -518,8 +518,8 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -564,8 +564,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: 
flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -578,8 +578,8 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -624,8 +624,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -638,8 +638,8 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -684,8 +684,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -698,8 +698,8 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -744,8 +744,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -758,8 +758,8 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -804,8 +804,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -818,8 +818,8 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: 
v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -864,8 +864,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -878,8 +878,8 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -924,8 +924,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -938,8 +938,8 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -984,8 +984,8 @@ define amdgpu_kernel void 
@v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -998,8 +998,8 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -1861,8 +1861,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -1876,8 +1876,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -1928,8 +1928,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; 
VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -1943,8 +1943,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace( ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0| ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2034,8 +2034,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2048,8 +2048,8 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2095,8 +2095,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 
v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2109,8 +2109,8 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2156,8 +2156,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2170,8 +2170,8 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2217,8 +2217,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2231,8 +2231,8 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2278,8 +2278,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2292,8 +2292,8 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2339,8 +2339,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2353,8 +2353,8 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 
v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2414,8 +2414,8 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2461,8 +2461,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2475,8 +2475,8 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2522,8 +2522,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr 
addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2536,8 +2536,8 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2583,8 +2583,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2597,8 +2597,8 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2644,8 +2644,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 
s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2658,8 +2658,8 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2704,8 +2704,8 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2718,8 +2718,8 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2764,8 +2764,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2778,8 
+2778,8 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -2824,8 +2824,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm @@ -2838,8 +2838,8 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) { ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 366b71bae75c9..e1c671c4eeb0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -44,8 +44,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: 
flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -71,8 +71,8 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -146,8 +146,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -173,8 +173,8 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -206,8 +206,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -233,8 +233,8 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: 
v_cmp_gt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -266,8 +266,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -293,8 +293,8 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -326,8 +326,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -353,8 +353,8 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: 
v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -386,8 +386,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -413,8 +413,8 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -446,8 +446,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -473,8 +473,8 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; 
SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -533,8 +533,8 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -566,8 +566,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -593,8 +593,8 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -626,8 +626,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; 
SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -653,8 +653,8 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -684,8 +684,8 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -742,8 +742,8 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -800,8 +800,8 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -858,8 +858,8 @@ define amdgpu_kernel void 
@v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -916,8 +916,8 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -974,8 +974,8 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1032,8 +1032,8 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1090,8 +1090,8 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; 
SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1148,8 +1148,8 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1206,8 +1206,8 @@ define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1] ; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SDAG-VI-NEXT: s_endpgm @@ -1266,8 +1266,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1293,8 +1293,8 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], 
v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1368,8 +1368,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1395,8 +1395,8 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1428,8 +1428,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1455,8 +1455,8 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1488,8 +1488,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: 
v_cmp_ge_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1515,8 +1515,8 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1548,8 +1548,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1575,8 +1575,8 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1608,8 +1608,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: 
v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1635,8 +1635,8 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1668,8 +1668,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1695,8 +1695,8 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1728,8 +1728,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1755,8 +1755,8 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 
%src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1788,8 +1788,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1815,8 +1815,8 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1848,8 +1848,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2 ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3 ; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SDAG-VI-NEXT: s_endpgm @@ -1875,8 +1875,8 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0 ; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: 
v_mov_b32_e32 v1, s3 +; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-VI-NEXT: s_endpgm @@ -1912,8 +1912,8 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll index 80f295b939709..0f1a487d13431 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll @@ -15,8 +15,8 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa ; GCN-NEXT: v_sub_f32_e32 v4, v2, v3 ; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2 ; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll index 44a4e8171ff33..3df81ac2e551a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -18,11 +18,11 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GCN-LABEL: load_1d_lwe: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v9, v8 ; GCN-NEXT: v_mov_b32_e32 v10, v8 ; GCN-NEXT: v_mov_b32_e32 v11, v8 ; GCN-NEXT: v_mov_b32_e32 v12, v8 
+; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v0, v8 ; GCN-NEXT: v_mov_b32_e32 v1, v9 ; GCN-NEXT: v_mov_b32_e32 v2, v10 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 4d9f0943a802d..fd9af6e536617 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -84,11 +84,11 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_1d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -131,13 +131,13 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_1d_tfe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -199,11 +199,11 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; 
GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -246,13 +246,13 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_1d_lwe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -352,12 +352,12 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX6789-LABEL: load_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -401,13 +401,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX11-LABEL: load_2d_tfe: 
; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -417,11 +418,12 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v0 -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v4, v11 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] @@ -515,13 +517,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_3d_tfe_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: 
v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -567,13 +569,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -681,13 +683,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace ; GFX6789-LABEL: load_cube_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; 
GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -733,13 +735,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -839,12 +841,12 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_1darray_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -888,13 +890,14 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11-LABEL: load_1darray_tfe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: 
v_mov_b32_e32 v2, v9 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -904,11 +907,12 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v7, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v0 -; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 -; GFX12-NEXT: v_dual_mov_b32 v10, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v2, v9 +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v11, v7 +; GFX12-NEXT: v_mov_b32_e32 v10, v7 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: v_mov_b32_e32 v4, v11 +; GFX12-NEXT: v_dual_mov_b32 v4, v11 :: v_dual_mov_b32 v3, v10 ; GFX12-NEXT: image_load v[0:4], [v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1002,13 +1006,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_2darray_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: 
v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1054,13 +1058,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1162,13 +1166,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX6789-LABEL: load_2dmsaa_both: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1214,13 +1218,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: 
v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1330,14 +1334,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX6789-LABEL: load_2darraymsaa_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 -; GFX6789-NEXT: v_mov_b32_e32 v8, v3 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 ; GFX6789-NEXT: v_mov_b32_e32 v11, v9 ; GFX6789-NEXT: v_mov_b32_e32 v12, v9 ; GFX6789-NEXT: v_mov_b32_e32 v13, v9 +; GFX6789-NEXT: v_mov_b32_e32 v8, v3 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: v_mov_b32_e32 v1, v10 ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 @@ -1384,13 +1388,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 
; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] @@ -1401,11 +1406,12 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v11, v9 +; GFX12-NEXT: v_mov_b32_e32 v2, v11 +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 -; GFX12-NEXT: v_mov_b32_e32 v4, v13 +; GFX12-NEXT: v_dual_mov_b32 v4, v13 :: v_dual_mov_b32 v3, v12 ; GFX12-NEXT: image_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] @@ -1497,12 +1503,12 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_mip_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: 
v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -1546,13 +1552,14 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: load_mip_1d_lwe: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1654,13 +1661,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX6789-LABEL: load_mip_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1706,13 +1713,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 
v5, v0 :: v_dual_mov_b32 v8, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -2133,10 +2140,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask3: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v5, 0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v5 ; GFX6789-NEXT: v_mov_b32_e32 v7, v5 ; GFX6789-NEXT: v_mov_b32_e32 v8, v5 +; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v5 ; GFX6789-NEXT: v_mov_b32_e32 v1, v6 ; GFX6789-NEXT: v_mov_b32_e32 v2, v7 @@ -2176,11 +2183,12 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX11-LABEL: load_1d_tfe_V4_dmask3: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 
v5, v3, s[8:9] @@ -2190,9 +2198,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 ; GFX12-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v5 -; GFX12-NEXT: v_mov_b32_e32 v8, v5 -; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v1, v6 +; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8 +; GFX12-NEXT: v_mov_b32_e32 v2, v7 ; GFX12-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v5, v3, s[8:9] @@ -2237,9 +2245,9 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask2: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v4, 0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v4 ; GFX6789-NEXT: v_mov_b32_e32 v6, v4 +; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 ; GFX6789-NEXT: v_mov_b32_e32 v1, v5 ; GFX6789-NEXT: v_mov_b32_e32 v2, v6 @@ -2276,10 +2284,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a ; GFX11-LABEL: load_1d_tfe_V4_dmask2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v4, v2, s[8:9] @@ -2333,8 +2341,8 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x 
i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V4_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe @@ -2422,8 +2430,8 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, ptr a ; GFX6789-LABEL: load_1d_tfe_V2_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 3d64ef16a3c8c..f5c4d08bfe871 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -26,15 +26,15 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05] ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; encoding: [0x52,0x02,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; 
encoding: [0x08,0x03,0x16,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] -; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 ; encoding: [0x0a,0x01,0x10,0xca,0x09,0x01,0x00,0x02] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x04,0x03] ; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00] @@ -85,15 +85,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09] ; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05] +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
encoding: [0x93,0x00,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] -; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] -; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x13,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v3, v12 ; encoding: [0x09,0x01,0x10,0xca,0x0c,0x01,0x02,0x00] +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; encoding: [0x0a,0x01,0x10,0xca,0x0d,0x01,0x04,0x01] ; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00] @@ -104,13 +105,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05] ; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: 
[0x23,0x01,0x87,0xbf] -; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a] -; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0a] +; GFX12-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] ; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] -; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] -; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v4, v13 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0d,0x01,0x10,0xca,0x0c,0x01,0x02,0x04] ; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05] ; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00] @@ -196,12 +198,11 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03] ; 
GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; encoding: [0x32,0x01,0x87,0xbf] ; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] -; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 ; encoding: [0x08,0x01,0x10,0xca,0x07,0x01,0x00,0x02] ; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 5a35c696c6e44..0f6bed26e7455 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -129,8 +129,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-TRUE16-NEXT: image_sample v[3:4], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -145,7 +144,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-FAKE16-NEXT: image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -158,8 +157,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-TRUE16-NEXT: image_sample v[3:4], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 @@ -174,7 +172,7 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v4 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v0, v4 ; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX12-FAKE16-NEXT: image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX12-FAKE16-NEXT: s_wait_samplecnt 
0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 8b60aa0e48cda..59c059b2d48a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -74,11 +74,11 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -117,13 +117,13 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -634,11 +634,11 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: 
v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -677,13 +677,13 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 -; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index 5a27a72de274d..8fafce03b90d1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -64,8 +64,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; DAGISEL12-NEXT: s_mov_b64 exec, s[10:11] -; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13 +; DAGISEL12-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL12-NEXT: v_mov_b32_e32 v12, s13 ; DAGISEL12-NEXT: ; %bb.2: ; %tail ; 
DAGISEL12-NEXT: s_or_b64 exec, exec, s[8:9] @@ -111,8 +111,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] ; DAGISEL10-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; DAGISEL10-NEXT: s_mov_b64 exec, s[10:11] -; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13 +; DAGISEL10-NEXT: v_mov_b32_e32 v11, s12 ; DAGISEL10-NEXT: v_mov_b32_e32 v12, s13 ; DAGISEL10-NEXT: ; %bb.2: ; %tail ; DAGISEL10-NEXT: s_or_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 614566a230f68..d70867d58043f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; TODO: Run these for global isel as well. ; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 < %s 2>&1 | FileCheck -check-prefix=ERR %s @@ -10,6 +9,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; TODO: Run these for global isel as well. 
+ ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) @@ -116,15 +117,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; ; GFX12-GISEL-TRUE16-LABEL: image_bvh_intersect_ray_a16: ; GFX12-GISEL-TRUE16: ; %bb.0: ; %main_body -; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s20, s2 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s22, s4 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s20, s2 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 +; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 -; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s16, s9 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s17, s10 @@ -137,14 +138,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; GFX12-GISEL-FAKE16-LABEL: image_bvh_intersect_ray_a16: ; GFX12-GISEL-FAKE16: ; %bb.0: ; %main_body ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s20, s2 +; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s22, s4 ; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s7, s5 -; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s3 ; GFX12-GISEL-FAKE16-NEXT: s_pack_hh_b32_b16 s5, s7, s5 ; 
GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 -; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s16, s9 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s17, s10 @@ -264,9 +265,9 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s22, s5 ; GFX12-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s9, s7 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s5, s4 -; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-TRUE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s16, s10 @@ -281,13 +282,13 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX12-GISEL-FAKE16: ; %bb.0: ; %main_body ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s20, s3 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s21, s4 -; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s22, s5 +; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_pack_hh_b32_b16 s5, s8, s6 ; GFX12-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s9, s7 -; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-FAKE16-NEXT: 
v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 ; GFX12-GISEL-FAKE16-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 ; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX12-GISEL-FAKE16-NEXT: s_mov_b32 s16, s10 @@ -417,22 +418,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL: ; %bb.0: ; %main_body ; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 -; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 -; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v8, s14 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v7, s13 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 
v2, vcc_lo, v2, v4 @@ -441,9 +444,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -572,15 +575,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v4, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -589,9 +594,9 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX12-GISEL-NEXT: flat_load_b32 v6, v[0:1] ; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s9 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -732,29 +737,29 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 ; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 ; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v9, 0xb36211c7 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v5, s10 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 @@ -886,25 +891,25 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 -; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v5, s10 ; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v3, s8 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 193fbdf35ec74..078e6d3fd0078 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -315,8 +315,8 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; SDAG_W64-NEXT: ; %bb.1: ; %if ; SDAG_W64-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W64-NEXT: s_addc_u32 s1, s1, 0 -; SDAG_W64-NEXT: v_mov_b32_e32 v3, s1 ; SDAG_W64-NEXT: v_mov_b32_e32 v2, s0 +; SDAG_W64-NEXT: v_mov_b32_e32 v3, s1 ; SDAG_W64-NEXT: ; %bb.2: ; %endif ; SDAG_W64-NEXT: s_or_b64 exec, exec, s[2:3] ; SDAG_W64-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -343,7 +343,7 @@ define amdgpu_cs void @inverse_ballot_branch(i64 
inreg %s0_1, i64 inreg %s2, ptr ; SDAG_W32-NEXT: ; %bb.1: ; %if ; SDAG_W32-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W32-NEXT: s_addc_u32 s1, s1, 0 -; SDAG_W32-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; SDAG_W32-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; SDAG_W32-NEXT: ; %bb.2: ; %endif ; SDAG_W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; SDAG_W32-NEXT: global_store_b64 v[0:1], v[2:3], off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index 679b289e13969..574c3c9c2d237 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -1306,8 +1306,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1334,8 +1334,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] @@ -1627,17 +1627,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x34 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1655,17 +1655,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] @@ -1750,8 
+1750,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -1778,8 +1778,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 2fb677eccc4b3..6aec0859bf3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -2257,10 +2257,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; 
GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2304,10 +2304,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2467,10 +2467,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2499,11 +2499,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2532,10 +2532,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2564,11 +2564,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2771,10 +2771,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; 
GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2818,10 +2818,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -2982,10 +2982,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3029,10 +3029,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 
s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3193,10 +3193,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3240,10 +3240,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3404,10 +3404,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX942-SDAG-NEXT: 
v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3451,10 +3451,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 @@ -3614,10 +3614,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3646,11 +3646,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], 
s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3679,10 +3679,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3711,11 +3711,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3917,10 +3917,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; 
GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3949,11 +3949,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3982,10 +3982,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4014,11 +4014,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; 
GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4220,10 +4220,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4252,11 +4252,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4285,10 
+4285,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4317,11 +4317,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4523,10 +4523,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; 
GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4555,11 +4555,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4588,10 +4588,10 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4620,11 +4620,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 ; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 ; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; 
GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 13a96cfa6e650..147086a00cee8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + ; FIXME: bfloat vector arguments are broken in globalisel. ; https://github.com/llvm/llvm-project/issues/77055 @@ -22,8 +23,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -94,8 +95,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: v_accvgpr_write_b32 a1, s9 ; GCN-NEXT: v_accvgpr_write_b32 a2, s10 ; GCN-NEXT: v_accvgpr_write_b32 a3, s11 @@ -248,55 +249,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a31, s23 +; GCN-NEXT: v_accvgpr_write_b32 a30, s22 +; GCN-NEXT: v_accvgpr_write_b32 a29, s21 +; GCN-NEXT: v_accvgpr_write_b32 a28, s20 +; GCN-NEXT: v_accvgpr_write_b32 a27, s19 +; GCN-NEXT: v_accvgpr_write_b32 a26, s18 +; GCN-NEXT: v_accvgpr_write_b32 a25, s17 +; GCN-NEXT: v_accvgpr_write_b32 a24, s16 +; GCN-NEXT: v_accvgpr_write_b32 a23, s15 +; GCN-NEXT: v_accvgpr_write_b32 a22, s14 +; GCN-NEXT: v_accvgpr_write_b32 a21, s13 +; GCN-NEXT: v_accvgpr_write_b32 a20, s12 +; GCN-NEXT: v_accvgpr_write_b32 a19, s11 +; GCN-NEXT: 
v_accvgpr_write_b32 a18, s10 +; GCN-NEXT: v_accvgpr_write_b32 a17, s9 +; GCN-NEXT: v_accvgpr_write_b32 a16, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[2:5], v[6:9], a[16:31] +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 
v36, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -311,55 +319,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v36, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GCN-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GCN-NEXT: v_mov_b32_e32 v32, s20 -; GCN-NEXT: v_mov_b32_e32 v33, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v34, s22 -; GCN-NEXT: v_mov_b32_e32 v35, s23 -; GCN-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_nop 2 -; GCN-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; 
GCN-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a31, s23 +; GCN-NEXT: v_accvgpr_write_b32 a30, s22 +; GCN-NEXT: v_accvgpr_write_b32 a29, s21 +; GCN-NEXT: v_accvgpr_write_b32 a28, s20 +; GCN-NEXT: v_accvgpr_write_b32 a27, s19 +; GCN-NEXT: v_accvgpr_write_b32 a26, s18 +; GCN-NEXT: v_accvgpr_write_b32 a25, s17 +; GCN-NEXT: v_accvgpr_write_b32 a24, s16 +; GCN-NEXT: v_accvgpr_write_b32 a23, s15 +; GCN-NEXT: v_accvgpr_write_b32 a22, s14 +; GCN-NEXT: v_accvgpr_write_b32 a21, s13 +; GCN-NEXT: v_accvgpr_write_b32 a20, s12 +; GCN-NEXT: v_accvgpr_write_b32 a19, s11 +; GCN-NEXT: v_accvgpr_write_b32 a18, s10 +; GCN-NEXT: v_accvgpr_write_b32 a17, s9 +; GCN-NEXT: v_accvgpr_write_b32 a16, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s13 -; GCN-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v16, s8 -; GCN-NEXT: v_mov_b32_e32 v17, s9 -; GCN-NEXT: v_mov_b32_e32 v18, s10 -; 
GCN-NEXT: v_mov_b32_e32 v19, s11 -; GCN-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) @@ -375,26 +390,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; 
GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 +; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s10 +; GCN-NEXT: v_accvgpr_write_b32 a3, s11 +; GCN-NEXT: v_accvgpr_write_b32 a4, s12 +; GCN-NEXT: v_accvgpr_write_b32 a5, s13 +; GCN-NEXT: v_accvgpr_write_b32 a6, s14 +; GCN-NEXT: v_accvgpr_write_b32 a7, s15 +; GCN-NEXT: v_accvgpr_write_b32 a8, s16 +; GCN-NEXT: v_accvgpr_write_b32 a9, s17 +; GCN-NEXT: v_accvgpr_write_b32 a10, s18 +; GCN-NEXT: v_accvgpr_write_b32 a11, s19 +; GCN-NEXT: v_accvgpr_write_b32 a12, s20 +; GCN-NEXT: v_accvgpr_write_b32 a13, s21 +; GCN-NEXT: v_accvgpr_write_b32 a14, s22 +; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 10 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -408,31 +431,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
v_mov_b64_e32 v[16:17], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 +; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a2, s10 +; GCN-NEXT: v_accvgpr_write_b32 a3, s11 +; GCN-NEXT: v_accvgpr_write_b32 a4, s12 +; GCN-NEXT: v_accvgpr_write_b32 a5, s13 +; GCN-NEXT: v_accvgpr_write_b32 a6, s14 +; GCN-NEXT: v_accvgpr_write_b32 a7, s15 +; GCN-NEXT: v_accvgpr_write_b32 a8, s16 +; GCN-NEXT: v_accvgpr_write_b32 a9, s17 +; GCN-NEXT: v_accvgpr_write_b32 a10, s18 +; GCN-NEXT: v_accvgpr_write_b32 a11, s19 +; GCN-NEXT: v_accvgpr_write_b32 a12, s20 +; GCN-NEXT: v_accvgpr_write_b32 a13, s21 +; GCN-NEXT: v_accvgpr_write_b32 a14, s22 +; GCN-NEXT: v_accvgpr_write_b32 a15, s23 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 10 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GCN-NEXT: 
global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index ab0000f6831b6..c64845e35fe51 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -4,6 +4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s + declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -141,18 +142,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 
v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -164,14 +167,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -179,18 +184,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -203,8 +210,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] @@ -260,18 +267,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 
v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -283,14 +292,16 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 
v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -298,18 +309,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -322,8 +335,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; 
VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 @@ -389,8 +402,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -455,8 +468,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -472,8 +485,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -514,8 +527,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: 
v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -580,8 +593,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -770,8 +783,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -836,8 +849,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -853,8 +866,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: 
v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -895,8 +908,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[24:25] ; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[26:27] ; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 ; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 ; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 ; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 @@ -961,8 +974,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -1485,55 +1498,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v36, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: 
v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 
sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1542,44 +1562,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; 
GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; 
GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1588,55 
+1616,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v36, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v10, s20 +; HEURRC-NEXT: v_mov_b32_e32 v11, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v12, s22 +; HEURRC-NEXT: v_mov_b32_e32 v13, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; 
HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1645,13 +1680,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -1659,41 +1694,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: 
v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[34:37], v[38:41], v[16:31] +; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[42:45], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: 
global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1827,55 +1862,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v36, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[42:43], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 -; SDAG-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: 
v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: 
v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1884,44 +1926,52 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: 
v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: 
global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1930,55 +1980,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v36, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] 
-; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 -; HEURRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v10, s20 +; HEURRC-NEXT: v_mov_b32_e32 v11, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v12, s22 +; HEURRC-NEXT: v_mov_b32_e32 v13, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: 
global_store_dwordx4 v0, v[10:13], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt 
vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1987,13 +2044,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v36, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[26:27] -; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[24:25] -; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], s[28:29] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -2001,41 +2058,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 +; VGPRRC-NEXT: v_mov_b32_e32 
v45, s23 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[42:45], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ 
-2170,26 +2227,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: 
global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2198,26 +2263,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 
a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2226,26 +2299,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 
s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2257,8 +2338,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: 
v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -2351,26 +2432,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: 
v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2379,67 +2468,83 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; GISEL-NEXT: s_endpgm -; -; 
HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: -; HEURRC: ; %bb.0: -; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 -; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; HEURRC-NEXT: s_endpgm -; -; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: -; VGPRRC: ; %bb.0: -; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 -; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: 
v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 10 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 
a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: s_nop 10 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] @@ -2661,24 +2766,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: 
v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2690,14 +2795,16 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2705,24 +2812,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; HEURRC-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: v_mov_b32_e32 v6, s12 +; HEURRC-NEXT: v_mov_b32_e32 v7, s13 +; HEURRC-NEXT: v_mov_b32_e32 v8, s14 +; HEURRC-NEXT: v_mov_b32_e32 v9, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2730,24 +2837,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, 
s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -2808,24 +2915,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: 
v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2837,14 +2944,16 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2852,24 +2961,24 @@ define amdgpu_kernel void 
@test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: v_mov_b32_e32 v4, s12 -; HEURRC-NEXT: v_mov_b32_e32 v5, s13 -; HEURRC-NEXT: v_mov_b32_e32 v6, s14 -; HEURRC-NEXT: v_mov_b32_e32 v7, s15 -; HEURRC-NEXT: v_mov_b32_e32 v8, s0 -; HEURRC-NEXT: v_mov_b32_e32 v9, s1 -; HEURRC-NEXT: v_mov_b32_e32 v10, s2 -; HEURRC-NEXT: v_mov_b32_e32 v11, s3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: v_mov_b32_e32 v6, s12 +; HEURRC-NEXT: v_mov_b32_e32 v7, s13 +; HEURRC-NEXT: v_mov_b32_e32 v8, s14 +; HEURRC-NEXT: v_mov_b32_e32 v9, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2877,24 +2986,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: 
v_mov_b32_e32 v4, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -2967,11 +3076,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_mov_b32_e32 v8, s28 ; SDAG-NEXT: v_mov_b32_e32 v9, s29 ; SDAG-NEXT: v_mov_b32_e32 v10, s30 ; SDAG-NEXT: v_mov_b32_e32 v11, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3038,8 +3147,8 @@ 
define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3055,8 +3164,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -3097,11 +3206,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; HEURRC-NEXT: v_mov_b32_e32 v5, s25 ; HEURRC-NEXT: v_mov_b32_e32 v6, s26 ; HEURRC-NEXT: v_mov_b32_e32 v7, s27 -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_mov_b32_e32 v8, s28 ; HEURRC-NEXT: v_mov_b32_e32 v9, s29 ; HEURRC-NEXT: v_mov_b32_e32 v10, s30 ; HEURRC-NEXT: v_mov_b32_e32 v11, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 ; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 ; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3168,11 +3277,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 ; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 ; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 -; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v40, s28 ; VGPRRC-NEXT: v_mov_b32_e32 v41, s29 ; VGPRRC-NEXT: v_mov_b32_e32 v42, s30 ; VGPRRC-NEXT: v_mov_b32_e32 v43, s31 +; 
VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -3379,11 +3488,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b32_e32 v5, s25 ; SDAG-NEXT: v_mov_b32_e32 v6, s26 ; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_mov_b32_e32 v8, s28 ; SDAG-NEXT: v_mov_b32_e32 v9, s29 ; SDAG-NEXT: v_mov_b32_e32 v10, s30 ; SDAG-NEXT: v_mov_b32_e32 v11, s31 +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3450,8 +3559,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -3467,8 +3576,8 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 ; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 ; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] @@ -3509,11 +3618,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; HEURRC-NEXT: v_mov_b32_e32 v5, s25 ; HEURRC-NEXT: v_mov_b32_e32 v6, s26 ; HEURRC-NEXT: v_mov_b32_e32 v7, 
s27 -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_mov_b32_e32 v8, s28 ; HEURRC-NEXT: v_mov_b32_e32 v9, s29 ; HEURRC-NEXT: v_mov_b32_e32 v10, s30 ; HEURRC-NEXT: v_mov_b32_e32 v11, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 ; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 ; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 ; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 @@ -3580,11 +3689,11 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 ; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 ; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 -; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b32_e32 v40, s28 ; VGPRRC-NEXT: v_mov_b32_e32 v41, s29 ; VGPRRC-NEXT: v_mov_b32_e32 v42, s30 ; VGPRRC-NEXT: v_mov_b32_e32 v43, s31 +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] ; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] ; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] @@ -4124,63 +4233,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s24 -; SDAG-NEXT: v_mov_b32_e32 v37, s25 -; SDAG-NEXT: v_mov_b32_e32 v38, s26 -; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: 
v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: 
v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4189,44 +4305,52 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: 
s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; 
GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] 
offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4234,63 +4358,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v40, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v36, s24 -; HEURRC-NEXT: v_mov_b32_e32 v37, s25 -; HEURRC-NEXT: v_mov_b32_e32 v38, s26 -; HEURRC-NEXT: v_mov_b32_e32 v39, s27 +; HEURRC-NEXT: v_mov_b32_e32 v6, s24 +; HEURRC-NEXT: v_mov_b32_e32 v7, s25 +; HEURRC-NEXT: v_mov_b32_e32 v8, s26 +; HEURRC-NEXT: v_mov_b32_e32 v9, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, 
s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: 
v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4298,17 +4429,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 -; VGPRRC-NEXT: 
v_mov_b32_e32 v38, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4319,42 +4450,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; 
VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: @@ -4501,63 +4632,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v40, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s20 -; SDAG-NEXT: v_mov_b32_e32 v33, s21 -; SDAG-NEXT: v_mov_b32_e32 v34, s22 -; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v36, s24 -; SDAG-NEXT: v_mov_b32_e32 v37, s25 -; SDAG-NEXT: v_mov_b32_e32 v38, s26 -; SDAG-NEXT: v_mov_b32_e32 v39, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], 
s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, 
s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4566,44 +4704,52 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a 
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v56, 0 +; GISEL-NEXT: v_mov_b32_e32 v24, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: 
v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 +; 
GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4611,63 +4757,70 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v40, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v32, s20 -; HEURRC-NEXT: v_mov_b32_e32 v33, s21 -; HEURRC-NEXT: v_mov_b32_e32 v34, s22 -; HEURRC-NEXT: v_mov_b32_e32 v35, s23 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v36, s24 -; HEURRC-NEXT: v_mov_b32_e32 v37, s25 -; HEURRC-NEXT: v_mov_b32_e32 v38, s26 -; HEURRC-NEXT: v_mov_b32_e32 v39, s27 +; HEURRC-NEXT: v_mov_b32_e32 v6, s24 +; HEURRC-NEXT: v_mov_b32_e32 v7, s25 +; HEURRC-NEXT: v_mov_b32_e32 v8, s26 +; HEURRC-NEXT: v_mov_b32_e32 v9, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: s_nop 6 -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v2, s20 +; HEURRC-NEXT: v_mov_b32_e32 v3, s21 +; HEURRC-NEXT: v_mov_b32_e32 v4, s22 +; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s16 -; HEURRC-NEXT: v_mov_b32_e32 v17, s17 -; HEURRC-NEXT: v_mov_b32_e32 v18, s18 -; HEURRC-NEXT: v_mov_b32_e32 v19, s19 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s16 +; HEURRC-NEXT: v_mov_b32_e32 v3, s17 +; HEURRC-NEXT: v_mov_b32_e32 v4, s18 +; HEURRC-NEXT: v_mov_b32_e32 v5, s19 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s12 -; HEURRC-NEXT: v_mov_b32_e32 v17, s13 -; HEURRC-NEXT: v_mov_b32_e32 v18, s14 -; HEURRC-NEXT: v_mov_b32_e32 v19, s15 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s12 +; HEURRC-NEXT: v_mov_b32_e32 v3, s13 +; HEURRC-NEXT: v_mov_b32_e32 v4, s14 +; HEURRC-NEXT: v_mov_b32_e32 v5, s15 +; HEURRC-NEXT: global_store_dwordx4 v0, 
v[2:5], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v17, s9 -; HEURRC-NEXT: v_mov_b32_e32 v18, s10 -; HEURRC-NEXT: v_mov_b32_e32 v19, s11 -; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v2, s8 +; HEURRC-NEXT: v_mov_b32_e32 v3, s9 +; HEURRC-NEXT: v_mov_b32_e32 v4, s10 +; HEURRC-NEXT: v_mov_b32_e32 v5, s11 +; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4675,17 +4828,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], 
s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4696,42 +4849,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; 
VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: @@ -4879,32 +5032,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; 
SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -4913,26 +5074,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 
x i32> %arg0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: 
global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -4940,32 +5109,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v20, s24 -; HEURRC-NEXT: v_mov_b32_e32 v21, s25 -; HEURRC-NEXT: v_mov_b32_e32 v22, s26 -; HEURRC-NEXT: v_mov_b32_e32 v23, s27 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, 
s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5085,32 +5262,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: 
v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v4, s24 +; SDAG-NEXT: v_mov_b32_e32 v5, s25 +; SDAG-NEXT: v_mov_b32_e32 v6, s26 +; SDAG-NEXT: v_mov_b32_e32 v7, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 10 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; 
SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5119,26 +5304,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 
-; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5146,32 +5339,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v16, s20 -; HEURRC-NEXT: v_mov_b32_e32 v17, s21 -; HEURRC-NEXT: v_mov_b32_e32 v18, s22 -; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v20, s24 -; HEURRC-NEXT: v_mov_b32_e32 v21, s25 -; HEURRC-NEXT: v_mov_b32_e32 v22, s26 -; HEURRC-NEXT: v_mov_b32_e32 v23, s27 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; HEURRC-NEXT: v_mov_b64_e32 
v[10:11], s[18:19] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v16, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_nop 10 -; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5421,18 +5622,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; 
GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5440,18 +5643,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5464,8 +5669,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] @@ -5521,18 +5726,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], 
v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5540,18 +5747,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v4, 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] -; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] -; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[12:13] -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5564,8 +5773,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; VGPRRC-NEXT: 
v_mov_b64_e32 v[10:11], s[12:13] -; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 @@ -5615,5 +5824,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index e7d7f87e4fc4c..a934f0e9c6770 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1792,9 +1792,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 @@ -1829,9 +1829,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 @@ -1866,9 +1866,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr 
a ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -1968,9 +1968,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 @@ -1990,9 +1990,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2012,9 +2012,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], 
v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 @@ -2181,9 +2181,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 @@ -2219,9 +2219,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 @@ -2256,9 +2256,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2358,9 +2358,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2380,9 +2380,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 -; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 @@ -2402,9 +2402,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 @@ -3328,11 +3328,11 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace( ; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4335,11 +4335,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; NOLIT-SRCC: ; 
%bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4938,11 +4938,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 @@ -4960,11 +4960,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 ; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 -; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_nop 0 @@ -5575,8 +5575,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] ; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v34, 2.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] @@ -5619,11 +5619,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 ; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 -; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] @@ -5642,11 +5642,11 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 ; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 +; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 -; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] @@ -5844,40 +5844,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(7) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(6) +; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a24, v24 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(5) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(4) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(3) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(2) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(1) +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0) ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 ; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a30, v30 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 @@ -5939,40 +5946,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; LIT-SRCC-NEXT: s_waitcnt vmcnt(7) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(6) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(5) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(4) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(3) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(2) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 +; LIT-SRCC-NEXT: s_waitcnt vmcnt(1) +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: s_waitcnt vmcnt(0) ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16 ; 
LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31 -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index aae14c8cc87b3..779bbd0e4d1bf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + ; 0 = fp8 ; 1 = bf8 ; 2 = fp6 @@ -1871,36 +1872,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: 
v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v21, s12 -; SDAG-NEXT: v_mov_b32_e32 v22, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_mov_b32_e32 v1, s12 +; SDAG-NEXT: v_mov_b32_e32 v2, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v1, v2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; 
GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1913,18 +1914,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] -; GISEL-NEXT: v_mov_b32_e32 v20, s28 -; GISEL-NEXT: v_mov_b32_e32 v21, s29 +; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 +; GISEL-NEXT: v_mov_b32_e32 v16, s28 +; GISEL-NEXT: v_mov_b32_e32 v17, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1937,32 +1940,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, -2 -; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; 
SDAG-NEXT: v_mov_b32_e32 v1, -2 +; SDAG-NEXT: v_mov_b32_e32 v2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: 
test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: @@ -1970,24 +1975,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v21, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2000,32 +2007,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; 
SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v22, 0x41 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: @@ -2033,24 +2042,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v21, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, 1.0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> 
@llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2063,32 +2074,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, -2 -; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, -2 +; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: 
v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2096,24 +2109,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v21, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], 
a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2126,32 +2141,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v21, 0.15915494 -; SDAG-NEXT: v_mov_b32_e32 v22, 1.0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0.15915494 +; SDAG-NEXT: v_mov_b32_e32 v2, 1.0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; SDAG-NEXT: v_mov_b32_e32 v4, s8 +; SDAG-NEXT: v_mov_b32_e32 v5, s9 +; SDAG-NEXT: v_mov_b32_e32 v6, s10 +; SDAG-NEXT: v_mov_b32_e32 v7, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s12 +; SDAG-NEXT: v_mov_b32_e32 v9, s13 +; SDAG-NEXT: v_mov_b32_e32 v10, s14 +; SDAG-NEXT: v_mov_b32_e32 v11, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s16 
+; SDAG-NEXT: v_mov_b32_e32 v13, s17 +; SDAG-NEXT: v_mov_b32_e32 v14, s18 +; SDAG-NEXT: v_mov_b32_e32 v15, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 11 -; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2159,24 +2176,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x50 -; GISEL-NEXT: v_mov_b32_e32 v20, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v21, 0.15915494 +; GISEL-NEXT: v_mov_b32_e32 v16, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v17, 0.15915494 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; 
GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 10 -; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2549,5 +2568,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index f0205a3a788ed..c342cd140100b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GCN,GISEL %s + ; 0 = fp8 ; 1 = bf8 ; 2 = fp6 @@ -4600,41 +4601,49 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 
+; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4643,33 +4652,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: 
v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_mov_b32_e32 v32, s0 -; GISEL-NEXT: v_mov_b32_e32 v33, s1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b32_e32 v16, s0 +; GISEL-NEXT: v_mov_b32_e32 v17, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], 
v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4681,78 +4698,94 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v32, -2 -; SDAG-NEXT: v_mov_b32_e32 v33, 0x41 +; SDAG-NEXT: v_mov_b32_e32 v0, -2 +; SDAG-NEXT: v_mov_b32_e32 v1, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: v_mov_b32_e32 v20, s12 -; SDAG-NEXT: v_mov_b32_e32 v21, s13 -; SDAG-NEXT: v_mov_b32_e32 v22, s14 -; SDAG-NEXT: v_mov_b32_e32 v23, s15 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; SDAG-NEXT: v_mov_b32_e32 v24, s16 -; SDAG-NEXT: v_mov_b32_e32 v25, s17 -; SDAG-NEXT: v_mov_b32_e32 v26, 
s18 -; SDAG-NEXT: v_mov_b32_e32 v27, s19 -; SDAG-NEXT: v_mov_b32_e32 v28, s20 -; SDAG-NEXT: v_mov_b32_e32 v29, s21 -; SDAG-NEXT: v_mov_b32_e32 v30, s22 -; SDAG-NEXT: v_mov_b32_e32 v31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 -; GISEL-NEXT: v_mov_b32_e32 v33, -2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 
v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> 
@llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4781,9 +4814,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_mov_b32_e32 v14, s24 ; SDAG-NEXT: v_mov_b32_e32 v15, s25 ; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4854,10 +4887,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4880,12 +4913,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; 
GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -5005,10 +5038,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 @@ -5029,12 +5062,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) @@ -5065,71 +5098,77 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s12 -; SDAG-NEXT: v_mov_b32_e32 v33, s13 -; SDAG-NEXT: v_mov_b32_e32 v34, s14 -; SDAG-NEXT: v_mov_b32_e32 v35, s15 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; 
SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v40, s20 -; SDAG-NEXT: v_mov_b32_e32 v41, s21 -; SDAG-NEXT: v_mov_b32_e32 v42, s22 -; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v44, s24 -; SDAG-NEXT: v_mov_b32_e32 v45, s25 -; SDAG-NEXT: v_mov_b32_e32 v46, s26 -; SDAG-NEXT: v_mov_b32_e32 v47, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v13, s25 +; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a19, 
s11 +; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 14 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: 
s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5137,45 +5176,61 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] -; 
GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 +; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a22, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; 
GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5188,73 +5243,80 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac: ; SDAG: ; %bb.0: ; 
SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v32, 42 -; SDAG-NEXT: v_mov_b32_e32 v33, 25 +; SDAG-NEXT: v_mov_b32_e32 v0, 42 +; SDAG-NEXT: v_mov_b32_e32 v1, 25 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 +; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: 
v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 -; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: 
v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v16, s8 -; SDAG-NEXT: v_mov_b32_e32 v17, s9 -; SDAG-NEXT: v_mov_b32_e32 v18, s10 -; SDAG-NEXT: v_mov_b32_e32 v19, s11 -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5262,54 +5324,62 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v32, 25 -; GISEL-NEXT: v_mov_b32_e32 v33, 42 -; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b32_e32 
v20, 25 +; GISEL-NEXT: v_mov_b32_e32 v21, 42 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; 
GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt 
vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -6370,6 +6440,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll index dbe95a8091932..e36b2181bf5c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll @@ -124,8 +124,8 @@ define amdgpu_ps void @test_call(ptr addrspace(1) inreg %ptr) { ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-SDAG-NEXT: s_mov_b32 s6, src_pops_exiting_wave_id -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: s_mov_b64 s[8:9], 36 +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 @@ -155,31 +155,51 @@ define amdgpu_ps void 
@test_call(ptr addrspace(1) inreg %ptr) { ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX10-LABEL: test_call: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s38, -1 -; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s2 -; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[8:9], 36 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id -; GFX10-NEXT: s_mov_b32 s32, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: s_endpgm +; GFX10-SDAG-LABEL: test_call: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX10-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX10-SDAG-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-SDAG-NEXT: s_add_u32 s36, s36, s2 +; GFX10-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX10-SDAG-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX10-SDAG-NEXT: s_mov_b64 s[8:9], 36 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SDAG-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id +; GFX10-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: test_call: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_mov_b32 s36, 
SCRATCH_RSRC_DWORD0 +; GFX10-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX10-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX10-GISEL-NEXT: s_add_u32 s36, s36, s2 +; GFX10-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX10-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX10-GISEL-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX10-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-GISEL-NEXT: s_mov_b32 s0, src_pops_exiting_wave_id +; GFX10-GISEL-NEXT: s_mov_b64 s[8:9], 36 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX10-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.pops.exiting.wave.id() call void @foo(i32 %id) ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll index b9bf76c1423b6..22db8d504f416 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll @@ -244,9 +244,9 @@ define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrsp ; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1 ; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d1ba892d7f7e1..95ebb856b7aee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -446,9 +446,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -456,14 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; 
CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -480,9 +480,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -493,12 +493,12 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -517,9 +517,9 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; 
CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -533,9 +533,9 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -571,9 +571,9 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -593,10 +593,10 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -605,15 +605,15 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr 
addrspace(1 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -633,10 +633,10 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -645,15 +645,15 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 
v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 0795f4050b622..d64a0bedc57b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s + declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0 declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0 declare double @llvm.amdgcn.readlane.f64(double, i32) #0 @@ -214,9 +215,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -228,9 +229,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -247,9 
+248,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -261,9 +262,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -312,8 +313,8 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -366,12 +367,12 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 ; CHECK-GISEL-NEXT: v_readlane_b32 
s3, v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -426,12 +427,12 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 ; CHECK-GISEL-NEXT: s_nop 3 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -458,9 +459,9 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -474,9 +475,9 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -490,15 +491,15 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: 
test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 @@ -507,16 +508,16 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -536,12 +537,12 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -549,18 +550,18 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -580,12 +581,12 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: 
s_endpgm @@ -593,18 +594,18 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -640,9 +641,9 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -657,15 +658,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; 
CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm @@ -674,15 +675,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -697,15 +698,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm @@ -714,15 +715,15 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index e58bf6280a1f2..0df585ea2cc58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -234,9 +234,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -1022,8 +1022,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX8GISEL-NEXT: s_add_u32 s5, s2, s3 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -1167,8 +1167,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4 ; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s5, s3 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1199,8 +1200,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s4 ; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s5, s3 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12DAGISEL-NEXT: s_endpgm entry: @@ -1559,8 +1561,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll index f39dd867f9580..bbea318026ce6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr 
addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1204,8 +1204,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll index 5d408dc65d68b..0e492c3b3f1d0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll @@ -41,8 +41,8 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll index 29dfb0b504f81..9213b0b59fa06 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll @@ -41,8 +41,8 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 ; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index 6f299ab8bb9cf..5488b123f0f12 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, 
s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1262,8 +1262,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index 3c4cbc74aedc1..65512fd382b09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: 
flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1262,8 +1262,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll index d6ccf7ce2831d..40fa80ff823b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll @@ -158,9 +158,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -818,9 +818,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -830,8 +830,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -899,9 +899,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: 
; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1205,8 +1205,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index f094213731684..84194714b95c7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -38,8 +38,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -249,9 +249,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -1058,8 +1058,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX8GISEL-NEXT: s_add_u32 s5, s2, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -1078,8 +1078,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s6 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_endpgm @@ -1242,8 +1242,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s6, s3 ; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s3, s5 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1284,8 +1285,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s6, s3 ; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4 ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s5 -; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12DAGISEL-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12DAGISEL-NEXT: s_endpgm entry: @@ -1661,8 +1663,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 54c8e2e248f57..17c3ca584b5d1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -159,9 +159,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -819,9 +819,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -831,8 +831,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -900,9 +900,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1391,8 +1391,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB7_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 502ef84449751..ec2fb68273270 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -159,9 +159,9 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -819,9 +819,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8DAGISEL-NEXT: s_endpgm @@ -831,8 +831,8 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm @@ -900,9 +900,9 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { ; GFX1132DAGISEL-LABEL: uniform_value_i64: ; GFX1132DAGISEL: ; %bb.0: ; %entry ; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1132DAGISEL-NEXT: s_endpgm ; @@ -1251,8 +1251,8 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: .LBB5_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; 
GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll index d5f1750c268ab..9ed34b95908a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s + define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: uniform_value: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -37,8 +38,8 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -200,19 +201,153 @@ entry: ret void } +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: const_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: const_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: const_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: const_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: const_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: const_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: const_value: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: const_value: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; 
GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: const_value: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: const_value: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: poison_value: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: poison_value: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: poison_value: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: poison_value: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: poison_value: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: poison_value: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_endpgm +; +; GFX11DAGISEL-LABEL: poison_value: +; GFX11DAGISEL: ; %bb.0: ; %entry +; GFX11DAGISEL-NEXT: s_endpgm +; +; GFX11GISEL-LABEL: poison_value: +; GFX11GISEL: ; %bb.0: ; %entry +; GFX11GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 poison, i32 1) + store i32 
%result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8DAGISEL-LABEL: divergent_value: ; GFX8DAGISEL: ; %bb.0: ; %entry ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -226,17 +361,17 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 -; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX8GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GISEL-NEXT: s_endpgm @@ -247,13 +382,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; 
GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -265,13 +400,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX9GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -285,13 +420,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064DAGISEL-NEXT: 
s_cbranch_scc1 .LBB3_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -303,13 +438,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1064GISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -323,13 +458,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032DAGISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -341,13 +476,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner 
Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3 ; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1032GISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -362,14 +497,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164DAGISEL-NEXT: s_xor_b32 s4, s4, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -382,14 +517,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s4, 0 -; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5 ; GFX1164GISEL-NEXT: s_xor_b32 
s4, s4, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 @@ -403,14 +538,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132DAGISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -423,14 +558,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { ; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s2, 0 -; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4 ; GFX1132GISEL-NEXT: s_xor_b32 s2, s2, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 ; 
GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -450,7 +585,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -459,24 +594,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -491,7 +626,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; 
implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -500,20 +635,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB4_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL-NEXT: .LBB4_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -529,7 +664,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -538,24 +673,24 @@ define 
amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -569,7 +704,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -578,20 +713,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 
s6, s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB4_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL-NEXT: .LBB4_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -606,7 +741,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -615,24 +750,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz 
.LBB4_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -646,7 +781,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -655,20 +790,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop 
Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL-NEXT: .LBB4_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -683,7 +818,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -692,24 +827,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1032DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 
s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -723,7 +858,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -732,20 +867,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif 
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL-NEXT: .LBB4_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -762,7 +897,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -772,25 +907,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 
s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -806,7 +941,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -816,21 +951,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL-NEXT: .LBB4_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 
s6 @@ -847,7 +982,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -857,25 +992,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -891,7 +1026,7 @@ 
define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -901,21 +1036,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL-NEXT: .LBB4_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -940,905 +1075,3 @@ endif: store i32 %combine, ptr addrspace(1) %out ret void } - -define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { -; GFX8DAGISEL-LABEL: uniform_value_i64: -; GFX8DAGISEL: ; %bb.0: ; %entry -; 
GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX8DAGISEL-NEXT: s_mul_i32 s1, s3, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8DAGISEL-NEXT: s_mul_i32 s0, s2, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: uniform_value_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8GISEL-NEXT: s_endpgm -; -; GFX9DAGISEL-LABEL: uniform_value_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: uniform_value_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] -; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: uniform_value_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: uniform_value_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: uniform_value_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; 
GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; GFX1032GISEL-LABEL: uniform_value_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: uniform_value_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: uniform_value_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_and_b32 
s4, s4, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: uniform_value_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1132DAGISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: uniform_value_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s4 -; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s4 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1) - store i64 %result, ptr addrspace(1) %out - ret void -} - -define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { -; GFX8DAGISEL-LABEL: divergent_value_i64: -; GFX8DAGISEL: ; 
%bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX8DAGISEL-NEXT: ; %bb.2: -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8GISEL-LABEL: divergent_value_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX8GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX8GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX8GISEL-NEXT: ; %bb.2: -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9DAGISEL-LABEL: divergent_value_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; 
GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9DAGISEL-NEXT: ; %bb.2: -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9GISEL-LABEL: divergent_value_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX9GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9GISEL-NEXT: ; %bb.2: -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064DAGISEL-LABEL: divergent_value_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1064DAGISEL-NEXT: ; %bb.2: -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1064GISEL-LABEL: divergent_value_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10 -; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10 -; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1064GISEL-NEXT: ; %bb.2: -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1032DAGISEL-LABEL: divergent_value_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 -; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1032DAGISEL-NEXT: ; %bb.2: -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1032DAGISEL-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX1032GISEL-LABEL: divergent_value_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 -; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 -; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 -; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 -; GFX1032GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] -; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1032GISEL-NEXT: ; %bb.2: -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 -; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164DAGISEL-LABEL: divergent_value_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6 -; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v3, s6 -; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s6 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1164DAGISEL-NEXT: ; %bb.2: -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1164GISEL-LABEL: divergent_value_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6 -; GFX1164GISEL-NEXT: v_readlane_b32 s5, v3, s6 -; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s6 -; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1164GISEL-NEXT: ; %bb.2: -; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132DAGISEL-LABEL: divergent_value_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v3, s3 -; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1132DAGISEL-NEXT: ; %bb.2: -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1132GISEL-LABEL: divergent_value_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 -; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; 
GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 -; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 -; GFX1132GISEL-NEXT: v_readlane_b32 s5, v3, s3 -; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX1132GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX1132GISEL-NEXT: ; %bb.2: -; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %id.x, i32 1) - store i64 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 %in2) { -; GFX8DAGISEL-LABEL: divergent_cfg_i64: -; GFX8DAGISEL: ; %bb.0: ; %entry -; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8DAGISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX8DAGISEL-NEXT: ; %bb.1: ; %else -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX8DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: ; %bb.3: ; %if -; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec 
-; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, 1 -; GFX8DAGISEL-NEXT: s_mul_i32 s4, s4, s6 -; GFX8DAGISEL-NEXT: s_mul_i32 s5, s5, s6 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8DAGISEL-NEXT: s_endpgm -; -; GFX8GISEL-LABEL: divergent_cfg_i64: -; GFX8GISEL: ; %bb.0: ; %entry -; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX8GISEL-NEXT: ; %bb.1: ; %else -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX8GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX8GISEL-NEXT: ; %bb.3: ; %if -; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8GISEL-NEXT: s_mul_i32 s6, s4, s7 -; GFX8GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif -; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8GISEL-NEXT: s_endpgm 
-; -; GFX9DAGISEL-LABEL: divergent_cfg_i64: -; GFX9DAGISEL: ; %bb.0: ; %entry -; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX9DAGISEL-NEXT: ; %bb.1: ; %else -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX9DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: ; %bb.3: ; %if -; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX9DAGISEL-NEXT: s_mul_i32 s4, s6, s5 -; GFX9DAGISEL-NEXT: s_mul_i32 s5, s7, s5 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9DAGISEL-NEXT: s_endpgm -; -; GFX9GISEL-LABEL: divergent_cfg_i64: -; GFX9GISEL: ; %bb.0: ; %entry -; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX9GISEL-NEXT: ; %bb.1: ; %else -; GFX9GISEL-NEXT: s_mov_b64 
s[6:7], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX9GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX9GISEL-NEXT: ; %bb.3: ; %if -; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4 -; GFX9GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif -; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9GISEL-NEXT: s_endpgm -; -; GFX1064DAGISEL-LABEL: divergent_cfg_i64: -; GFX1064DAGISEL: ; %bb.0: ; %entry -; GFX1064DAGISEL-NEXT: s_clause 0x1 -; GFX1064DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 -; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; 
GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s6, s5 -; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s7, s5 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1064DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064DAGISEL-NEXT: s_endpgm -; -; GFX1064GISEL-LABEL: divergent_cfg_i64: -; GFX1064GISEL: ; %bb.0: ; %entry -; GFX1064GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 -; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1064GISEL-NEXT: ; %bb.1: ; %else -; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1064GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1064GISEL-NEXT: ; %bb.3: ; %if -; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064GISEL-NEXT: s_and_b32 s4, s4, 1 -; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 -; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX1064GISEL-NEXT: 
.LBB5_4: ; %endif -; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_endpgm -; -; GFX1032DAGISEL-LABEL: divergent_cfg_i64: -; GFX1032DAGISEL: ; %bb.0: ; %entry -; GFX1032DAGISEL-NEXT: s_clause 0x1 -; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 -; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 -; GFX1032DAGISEL-NEXT: s_and_b32 s5, s4, 1 -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s5 -; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032DAGISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s6, s3 -; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s7, s3 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032DAGISEL-NEXT: s_endpgm -; -; 
GFX1032GISEL-LABEL: divergent_cfg_i64: -; GFX1032GISEL: ; %bb.0: ; %entry -; GFX1032GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 -; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1032GISEL-NEXT: ; %bb.1: ; %else -; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1032GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1032GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1032GISEL-NEXT: ; %bb.3: ; %if -; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032GISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3 -; GFX1032GISEL-NEXT: s_mul_i32 s7, s7, s3 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif -; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX1032GISEL-NEXT: s_endpgm -; -; GFX1164DAGISEL-LABEL: divergent_cfg_i64: -; GFX1164DAGISEL: ; %bb.0: ; %entry -; GFX1164DAGISEL-NEXT: s_clause 0x1 -; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec -; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164DAGISEL-NEXT: 
v_cmpx_lt_u32_e32 15, v0 -; GFX1164DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1164DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, 1 -; GFX1164DAGISEL-NEXT: s_mul_i32 s4, s4, s6 -; GFX1164DAGISEL-NEXT: s_mul_i32 s5, s5, s6 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1164DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164DAGISEL-NEXT: s_endpgm -; -; GFX1164GISEL-LABEL: divergent_cfg_i64: -; GFX1164GISEL: ; %bb.0: ; %entry -; GFX1164GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec -; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, 
v0 -; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1164GISEL-NEXT: ; %bb.1: ; %else -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1164GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1164GISEL-NEXT: ; %bb.3: ; %if -; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s7 -; GFX1164GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif -; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_endpgm -; -; GFX1132DAGISEL-LABEL: divergent_cfg_i64: -; GFX1132DAGISEL: ; %bb.0: ; %entry -; GFX1132DAGISEL-NEXT: s_clause 0x1 -; GFX1132DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132DAGISEL-NEXT: s_mov_b32 s8, exec_lo -; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 -; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; 
GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 -; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else -; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132DAGISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1132DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow -; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1132DAGISEL-NEXT: s_mul_i32 s4, s4, s3 -; GFX1132DAGISEL-NEXT: s_mul_i32 s5, s5, s3 -; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132DAGISEL-NEXT: ; %bb.4: ; %endif -; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132DAGISEL-NEXT: s_endpgm -; -; GFX1132GISEL-LABEL: divergent_cfg_i64: -; GFX1132GISEL: ; %bb.0: ; %entry -; GFX1132GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1132GISEL-NEXT: s_mov_b32 s8, exec_lo -; GFX1132GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 -; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: 
s_cbranch_execz .LBB5_2 -; GFX1132GISEL-NEXT: ; %bb.1: ; %else -; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 -; GFX1132GISEL-NEXT: s_and_b32 s7, s6, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s7 -; GFX1132GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 -; GFX1132GISEL-NEXT: ; %bb.3: ; %if -; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132GISEL-NEXT: s_and_b32 s3, s3, 1 -; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3 -; GFX1132GISEL-NEXT: s_mul_i32 s7, s5, s3 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif -; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 -; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1132GISEL-NEXT: s_endpgm -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %d_cmp = icmp ult i32 %tid, 16 - br i1 %d_cmp, label %if, label %else - -if: - %reducedValTid = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in2, i32 1) - br label %endif - -else: - %reducedValIn = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 1) - br label %endif - -endif: - %combine = phi i64 [%reducedValTid, %if], [%reducedValIn, %else] - store i64 %combine, ptr addrspace(1) %out - ret void -} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10DAGISEL: {{.*}} -; GFX10GISEL: {{.*}} -; GFX11DAGISEL: {{.*}} -; GFX11GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index 20523476a29d5..07736f01b4166 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -12,43 +12,45 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; GCN-NEXT: ds_load_b128 v[0:3], v32 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064 -; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 -; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 -; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 -; GCN-NEXT: ds_load_b128 v[0:3], v32 ; GCN-NEXT: ds_load_b128 v[8:11], v32 offset:2048 +; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 ; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304 ; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496 ; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(4) -; GCN-NEXT: v_mov_b32_e32 v47, v7 -; GCN-NEXT: s_waitcnt lgkmcnt(3) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: s_waitcnt lgkmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v63, v23 -; GCN-NEXT: s_waitcnt lgkmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v71, v31 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 +; GCN-NEXT: s_waitcnt lgkmcnt(9) +; GCN-NEXT: v_mov_b32_e32 v43, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_dual_mov_b32 v47, v7 :: 
v_dual_mov_b32 v46, v6 ; GCN-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 -; GCN-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 -; GCN-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 +; GCN-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(7) +; GCN-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15 ; GCN-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 +; GCN-NEXT: s_waitcnt lgkmcnt(6) ; GCN-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 ; GCN-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: s_waitcnt lgkmcnt(5) +; GCN-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23 ; GCN-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 +; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 ; GCN-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 -; GCN-NEXT: v_mov_b32_e32 v56, v16 +; GCN-NEXT: s_waitcnt lgkmcnt(3) +; GCN-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31 ; GCN-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 +; GCN-NEXT: s_waitcnt lgkmcnt(2) ; GCN-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 ; GCN-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 -; GCN-NEXT: v_mov_b32_e32 v64, v24 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39 ; GCN-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 ; GCN-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 ; GCN-NEXT: v_mov_b32_e32 v72, v32 @@ -80,43 +82,45 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; EXACTCUTOFF-NEXT: ds_load_b128 
v[0:3], v32 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064 -; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 -; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 -; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 -; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v32 offset:2048 +; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 ; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144 +; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304 ; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288 +; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496 ; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(10) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v47, v7 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v55, v15 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v63, v23 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v71, v31 -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v79, v39 :: v_dual_mov_b32 v46, v6 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(9) +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v3 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(8) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v7 :: v_dual_mov_b32 v46, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v5 :: v_dual_mov_b32 v44, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v3 :: v_dual_mov_b32 v42, v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v41, v1 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(7) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v55, v15 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v53, v13 +; EXACTCUTOFF-NEXT: s_waitcnt 
lgkmcnt(6) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v51, v11 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v49, v9 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, v8 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(5) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v63, v23 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v62, v22 :: v_dual_mov_b32 v61, v21 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(4) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v60, v20 :: v_dual_mov_b32 v59, v19 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v58, v18 :: v_dual_mov_b32 v57, v17 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v56, v16 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(3) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v56, v16 :: v_dual_mov_b32 v71, v31 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v70, v30 :: v_dual_mov_b32 v69, v29 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v68, v28 :: v_dual_mov_b32 v67, v27 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v66, v26 :: v_dual_mov_b32 v65, v25 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v64, v24 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v64, v24 :: v_dual_mov_b32 v79, v39 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v78, v38 :: v_dual_mov_b32 v77, v37 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v76, v36 :: v_dual_mov_b32 v75, v35 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v74, v34 :: v_dual_mov_b32 v73, v33 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v72, v32 @@ -184,14 +188,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ds_load_b128 v[0:3], v17 +; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: 
v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 -; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GCN-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -202,9 +208,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: v_mov_b32_e32 v16, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -216,9 +223,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -230,9 +238,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288 ; GCN-NEXT: ; 
sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -244,9 +253,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(1) ; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -266,14 +276,16 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 ; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 -; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v11, v3 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, 
v2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v9, v1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) ; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -284,9 +296,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -298,9 +311,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -312,9 +326,10 @@ define amdgpu_kernel void 
@test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -326,9 +341,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) -; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(1) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4 +; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2 ; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index dcc3e0df0c744..703661e22b495 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -24,24 +24,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264 ; GCN-NEXT: ; 
sched_group_barrier mask(0x00000100) size(7) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x6 -; GCN-NEXT: v_mov_b32_e32 v31, v11 +; GCN-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10 +; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 ; GCN-NEXT: s_wait_dscnt 0x5 -; GCN-NEXT: v_mov_b32_e32 v35, v15 +; GCN-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14 +; GCN-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12 ; GCN-NEXT: s_wait_dscnt 0x4 -; GCN-NEXT: v_mov_b32_e32 v39, v19 +; GCN-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18 +; GCN-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16 ; GCN-NEXT: s_wait_dscnt 0x3 -; GCN-NEXT: v_mov_b32_e32 v43, v23 +; GCN-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22 +; GCN-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20 ; GCN-NEXT: s_wait_dscnt 0x2 -; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 -; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 -; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 -; GCN-NEXT: v_mov_b32_e32 v32, v12 -; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 -; GCN-NEXT: v_mov_b32_e32 v36, v16 -; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 -; GCN-NEXT: v_mov_b32_e32 v40, v20 -; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 -; GCN-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26 +; GCN-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24 ; GCN-NEXT: s_wait_dscnt 0x0 ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 ; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 @@ -76,24 +72,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11 +; 
EXACTCUTOFF-NEXT: v_dual_mov_b32 v31, v11 :: v_dual_mov_b32 v30, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v35, v15 :: v_dual_mov_b32 v34, v14 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v32, v12 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v39, v19 :: v_dual_mov_b32 v38, v18 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v37, v17 :: v_dual_mov_b32 v36, v16 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v23 :: v_dual_mov_b32 v42, v22 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v21 :: v_dual_mov_b32 v40, v20 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v46, v26 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v25 :: v_dual_mov_b32 v44, v24 ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 ; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll index 4905c6d8aa81b..78ff7e7510b66 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -91,15 +91,25 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_tma: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_tma: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_tma: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_tma: ; GFX1250: ; %bb.0: @@ -117,15 +127,25 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_realtime: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_realtime: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_realtime: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_realtime: ; GFX1250: ; %bb.0: @@ -186,15 +206,25 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_tba: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_tba: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_tba: +; GFX11-GISEL: ; %bb.0: +; 
GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_tba: ; GFX1250: ; %bb.0: @@ -255,15 +285,25 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { -; GFX11-LABEL: test_get_99999_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_99999_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_99999_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_99999_i64: ; GFX1250: ; %bb.0: @@ -281,15 +321,25 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) { } define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) { -; GFX11-LABEL: 
test_get_136_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: test_get_136_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: test_get_136_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(136, 0, 0) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_endpgm ; ; GFX1250-LABEL: test_get_136_i64: ; GFX1250: ; %bb.0: @@ -308,3 +358,5 @@ define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) { declare i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32) declare i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 25996ee11c5a1..1c3c6e3bc3489 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s + declare i32 @llvm.amdgcn.workitem.id.x() ; -------------------------------------------------------------------- @@ -44,20 +45,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b32_e32 v16, s16 +; GISEL-NEXT: v_mov_b32_e32 v12, s16 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -474,7 +475,6 @@ define 
<16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -482,6 +482,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -797,26 +798,26 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 
v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -828,20 +829,20 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -963,22 +964,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; 
SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -1264,7 +1265,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -1272,6 +1272,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: 
v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -1308,26 +1309,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1339,20 +1340,20 @@ define amdgpu_kernel void 
@test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1470,26 +1471,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, 
s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1501,20 +1502,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], 
v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1632,26 +1633,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: 
s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1663,20 +1664,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1794,26 +1795,26 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v12, 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: 
v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -1825,20 +1826,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: 
v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1960,22 +1961,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 
@@ -2261,7 +2262,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2269,6 +2269,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -2309,22 +2310,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, 
s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -2610,7 +2611,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2618,6 +2618,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -2658,22 +2659,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; 
SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -2959,7 +2960,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -2967,6 +2967,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -3007,22 +3008,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; 
SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -3308,7 +3309,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v18, v7 ; GISEL-NEXT: v_mov_b32_e32 v19, v8 ; GISEL-NEXT: v_mov_b32_e32 v20, v9 -; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b32_e32 v21, v10 ; GISEL-NEXT: v_mov_b32_e32 v0, s24 ; GISEL-NEXT: v_mov_b32_e32 v1, s25 @@ -3316,6 +3316,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v4, s28 ; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] @@ -3336,4 +3337,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { 
"amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index 037e26087eaa5..a482af1e41afe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -46,14 +46,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -89,15 +89,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ 
-406,14 +406,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -449,15 +449,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -516,14 +516,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; 
GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -559,15 +559,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -626,14 +626,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -669,15 +669,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 
x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -736,14 +736,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -779,15 +779,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] 
+; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -841,9 +841,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -870,11 +870,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -923,9 +923,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -952,11 +952,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1005,9 +1005,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1034,11 +1034,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; 
GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1087,9 +1087,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1116,11 +1116,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1174,14 +1174,14 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1 ; GISEL-NEXT: s_mov_b32 s2, 2 -; GISEL-NEXT: s_mov_b32 
s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1217,15 +1217,15 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, < ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_movk_i32 s0, 0x80 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1284,14 +1284,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: 
v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1327,15 +1327,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1389,9 +1389,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1418,11 +1418,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; 
GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] ; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off ; GISEL-NEXT: s_endpgm @@ -1476,14 +1476,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, < ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1519,15 +1519,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1586,14 +1586,14 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; 
GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1630,15 +1630,15 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v43, 0x65 @@ -1698,14 +1698,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i3 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 
v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1743,15 +1743,15 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] @@ -1806,9 +1806,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1835,11 +1835,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; 
GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1888,9 +1888,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1917,11 +1917,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1970,9 +1970,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -1999,11 +1999,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2052,9 +2052,9 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2081,11 +2081,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x42004200 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 -; GISEL-NEXT: s_mov_b32 s1, s0 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] ; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off ; GISEL-NEXT: s_endpgm @@ -2139,14 +2139,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2182,15 +2182,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; 
GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2249,14 +2249,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2292,15 +2292,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2359,14 +2359,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: 
s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2402,15 +2402,15 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2469,14 +2469,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2512,15 +2512,15 @@ define 
amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_mov_b32 s6, s0 -; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 ; GISEL-NEXT: s_mov_b32 s5, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2590,8 +2590,6 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s2, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2604,8 +2602,10 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2654,8 +2654,6 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: 
s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2669,8 +2667,10 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2746,8 +2746,6 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2760,8 +2758,10 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2811,8 +2811,6 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32 ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, 
s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2826,8 +2824,10 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32 ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2904,8 +2904,6 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> % ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 1.0 ; GISEL-NEXT: s_mov_b32 s1, 2.0 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 ; GISEL-NEXT: s_mov_b32 s4, s0 @@ -2918,8 +2916,10 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> % ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] @@ -2970,8 +2970,6 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i ; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GISEL-NEXT: s_mov_b32 s0, 0x40400000 ; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 -; GISEL-NEXT: s_mov_b32 s14, s0 -; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: s_mov_b32 s1, s0 ; GISEL-NEXT: s_mov_b32 s2, s0 ; GISEL-NEXT: s_mov_b32 s3, s0 @@ -2985,8 +2983,10 @@ define amdgpu_ps void 
@test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i ; GISEL-NEXT: s_mov_b32 s11, s0 ; GISEL-NEXT: s_mov_b32 s12, s0 ; GISEL-NEXT: s_mov_b32 s13, s0 -; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 ; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 2752649550b69..5f08f5970a0e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -534,8 +534,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -590,8 +590,8 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -656,14 +656,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v0, v[0:1] ; GFX802-SDAG-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: s_nop 3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, 12, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -719,14 +719,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dword v0, v[0:1] ; GFX802-GISEL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s2, v0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: s_nop 2 +; GFX802-GISEL-NEXT: s_nop 3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, 12, s2 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -791,13 +791,13 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_nop 2 +; GFX802-SDAG-NEXT: s_nop 3 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, 0, s2 ; 
GFX802-SDAG-NEXT: v_writelane_b32 v0, 12, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 @@ -937,14 +937,14 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX802-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; GFX802-SDAG-NEXT: s_nop 1 +; GFX802-SDAG-NEXT: s_nop 2 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, 0, s2 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s4, m0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 @@ -1087,9 +1087,9 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 @@ -1143,9 +1143,9 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 @@ -1496,8 +1496,8 @@ define 
amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm @@ -1536,8 +1536,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm @@ -1631,10 +1631,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -1734,10 +1734,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX802-GISEL-NEXT: 
v_mov_b32_e32 v3, s3 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -1786,8 +1786,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm @@ -1821,8 +1821,8 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s2, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm @@ -1907,11 +1907,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm @@ -2004,11 +2004,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, 
s1 ; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX802-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3897a0e028334..ccf85a0d8e45e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -382,8 +382,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -434,8 +434,8 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -916,8 +916,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: 
flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -988,8 +988,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm @@ -1674,8 +1674,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -1765,8 +1765,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) { ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 574b1c0b4974c..7c5d38c73dd2c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -384,8 +384,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; 
VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -436,8 +436,8 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm @@ -918,8 +918,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v5 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -990,8 +990,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, s5 ; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-GISEL-NEXT: s_endpgm @@ -1676,8 +1676,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v5 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v6 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v8, v0, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm @@ -1767,8 +1767,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v5 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5 ; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index dd44a1a35067e..27aeae985d1e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -61,9 +61,9 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) { ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: v_ldexp_f32 v2, v0, s2 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 ; VI-SDAG-NEXT: s_endpgm ; @@ -240,8 +240,8 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -465,18 +465,18 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; 
VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, s1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_cselect_b32 s4, 0xffffffc0, 0 ; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v3, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -765,17 +765,17 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc ; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s1, v4 ; VI-SDAG-NEXT: v_add_f32_e32 v0, s0, v0 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 ; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 ; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v1, v4, s2 ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 956145fb24c4a..ee53332f2f786 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll 
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX8CHECK-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX8CHECK-NEXT: s_cmpk_gt_i32 s2, 0x7f80 ; GFX8CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0 ; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1 ; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2 ; GFX8CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index dd19ba17bb292..edc200d757737 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -51,8 +51,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3 -; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX8SELDAG-NEXT: s_endpgm @@ -68,8 +68,8 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX8GLISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GLISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0 -; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GLISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll index 0a9fe10874c38..3d42c0bdf5dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -49,8 +49,8 @@ 
define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX8SELDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8SELDAG-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3 -; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] +; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX8SELDAG-NEXT: s_endpgm @@ -66,8 +66,8 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) { ; GFX8GLISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX8GLISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8GLISEL-NEXT: s_cselect_b32 s2, -1, 0 -; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8GLISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index e24fd1f22bfa6..6e8e3a9baef0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -285,12 +285,12 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 +; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm @@ -580,14 +580,14 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; 
VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v6, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v3, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v5, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v6, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, s5 ; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-SDAG-NEXT: s_endpgm @@ -967,16 +967,16 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; VI-SDAG-NEXT: s_cselect_b32 s1, 32, 0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 ; VI-SDAG-NEXT: v_log_f32_e32 v5, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s0, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v7, v7 ; VI-SDAG-NEXT: v_log_f32_e32 v8, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 -; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v7, v6 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v8, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, s4 ; VI-SDAG-NEXT: v_mov_b32_e32 v5, s5 ; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 76b97e843d777..37250ca94f42a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s8, s5 ; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 +; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], 
-v[0:1] @@ -150,8 +150,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s12, s10, s0 ; SI-NEXT: s_cselect_b32 s13, s11, s1 +; SI-NEXT: s_cselect_b32 s12, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -170,8 +170,8 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 ; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] @@ -243,8 +243,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s16, s10, s0 ; SI-NEXT: s_cselect_b32 s17, s11, s1 +; SI-NEXT: s_cselect_b32 s16, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -264,9 +264,9 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s5, s10, s5 ; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 -; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] @@ -285,8 +285,8 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s10, s9 ; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: 
s_cselect_b32 s9, s15, s9 +; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] @@ -397,8 +397,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: s_cselect_b32 s1, s3, s1 ; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: s_cselect_b32 s24, s10, s0 ; SI-NEXT: s_cselect_b32 s25, s11, s1 +; SI-NEXT: s_cselect_b32 s24, s10, s0 ; SI-NEXT: v_mov_b32_e32 v0, s24 ; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] @@ -418,9 +418,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 -; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] @@ -439,8 +439,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s11, s9 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 +; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] @@ -459,8 +459,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: s_cselect_b32 s5, s13, s5 +; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] @@ -480,8 +480,8 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, 
<8 x double> %in) ; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cselect_b32 s9, s11, s9 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_cselect_b32 s8, s18, s8 ; SI-NEXT: s_cselect_b32 s9, s19, s9 +; SI-NEXT: s_cselect_b32 s8, s18, s8 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 001d7487b51b4..cb1d68936a1cf 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -53,9 +53,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %ld = load double, ptr addrspace(4) %in diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 81e407de9c324..082d0d5957bf3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -616,9 +616,9 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -1551,8 +1551,8 @@ define amdgpu_kernel void 
@constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10003 @@ -1562,13 +1562,13 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10002 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10004 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 7, v0 ; GFX8-NEXT: v_bfe_u32 v2, v0, 6, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 @@ -1705,8 +1705,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v3, v4, 3, 1 @@ -1858,8 +1858,8 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s2 @@ -1883,28 +1883,28 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 
s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_mov_b32_e32 v7, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v11, s5 ; GFX8-NEXT: v_mov_b32_e32 v8, s16 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: v_mov_b32_e32 v10, s15 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_mov_b32_e32 v13, s4 @@ -2099,16 +2099,16 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: flat_load_ushort v18, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 ; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; 
GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2417,56 +2417,56 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s28 ; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -2856,56 +2856,56 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v1, s34 ; GFX8-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v1, s22 ; GFX8-NEXT: v_mov_b32_e32 v2, s21 ; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NEXT: v_mov_b32_e32 v1, s14 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 @@ -3437,93 +3437,93 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 -; 
GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NEXT: v_mov_b32_e32 v3, s38 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s56 ; GFX8-NEXT: v_mov_b32_e32 v3, s34 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s53 ; GFX8-NEXT: 
v_mov_b32_e32 v3, s29 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s51 ; GFX8-NEXT: v_mov_b32_e32 v1, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s25 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s49 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s47 ; GFX8-NEXT: v_mov_b32_e32 v1, s46 @@ -3535,30 +3535,30 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: s_add_u32 s22, s0, 64 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: s_add_u32 s18, s0, 48 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: s_add_u32 s14, s0, 32 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -3566,9 +3566,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s4, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -4274,84 +4274,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s61 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s59 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s55 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s53 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s51 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s43 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 
0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s33 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 -; GFX8-NEXT: s_add_u32 s26, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s27, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 @@ -4364,9 +4364,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s22, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4374,9 +4374,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s18, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4384,9 +4384,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s14, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; 
GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4394,9 +4394,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -4404,9 +4404,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -5072,8 +5072,9 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -5238,8 +5239,9 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -5500,9 +5502,9 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 @@ -5624,8 +5626,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 @@ -5756,14 +5758,14 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 @@ -5903,8 +5905,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: 
v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v0 @@ -6057,11 +6059,11 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s5 +; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v15, s4 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 @@ -6072,16 +6074,16 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 ; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v18, s1 ; GFX8-NEXT: v_mov_b32_e32 v17, s0 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 ; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 +; GFX8-NEXT: v_mov_b32_e32 v16, s3 ; GFX8-NEXT: v_mov_b32_e32 v15, s2 ; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 ; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 @@ -6253,8 +6255,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: 
s_lshr_b32 s2, s3, 6 @@ -6276,28 +6278,28 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v8, s6 ; GFX8-NEXT: v_mov_b32_e32 v9, s7 ; GFX8-NEXT: v_mov_b32_e32 v10, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s9 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v12, s10 ; GFX8-NEXT: v_mov_b32_e32 v13, s11 ; GFX8-NEXT: v_mov_b32_e32 v14, s12 ; GFX8-NEXT: v_mov_b32_e32 v15, s13 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 @@ -6542,43 +6544,43 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v11, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v10, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; 
GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v11, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: v_mov_b32_e32 v4, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 @@ -6862,8 +6864,8 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v19, s1 ; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_lshr_b32 s2, s3, 14 @@ -6901,64 +6903,64 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: v_mov_b32_e32 v9, s21 -; GFX8-NEXT: s_addc_u32 
s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v10, s22 ; GFX8-NEXT: v_mov_b32_e32 v11, s23 ; GFX8-NEXT: v_mov_b32_e32 v12, s24 ; GFX8-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v14, s26 ; GFX8-NEXT: v_mov_b32_e32 v15, s27 ; GFX8-NEXT: v_mov_b32_e32 v16, s28 ; GFX8-NEXT: v_mov_b32_e32 v17, s29 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 @@ -7365,94 +7367,94 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x80 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: 
s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 @@ -8071,48 +8073,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: s_addc_u32 s45, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, 
s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 ; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s45, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s44 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 @@ -8125,9 +8127,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s40, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v3, 
s41 ; GFX8-NEXT: s_addc_u32 s41, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v0, s42 ; GFX8-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8135,9 +8137,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s36, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NEXT: s_addc_u32 s37, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s38 ; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8145,9 +8147,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s30, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: s_addc_u32 s31, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8155,9 +8157,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s26, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8165,9 +8167,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s22, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v3, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NEXT: v_mov_b32_e32 v4, 
s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8175,9 +8177,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s18, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8185,9 +8187,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s14, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8195,9 +8197,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8205,9 +8207,9 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -8956,112 +8958,110 @@ define amdgpu_kernel void 
@constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s43 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0 ; GFX8-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NEXT: v_mov_b32_e32 v2, s44 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1e0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0 ; GFX8-NEXT: v_mov_b32_e32 v0, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s45 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1c0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0 ; GFX8-NEXT: v_mov_b32_e32 v0, s63 ; GFX8-NEXT: v_mov_b32_e32 v2, s47 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x1b0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x190 ; GFX8-NEXT: v_mov_b32_e32 v0, s62 ; GFX8-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 
s42, s0, 0x190 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x180 ; GFX8-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s49 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x180 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x170 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x170 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x160 ; GFX8-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NEXT: v_mov_b32_e32 v2, s51 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x150 ; GFX8-NEXT: v_mov_b32_e32 v0, s58 ; GFX8-NEXT: v_mov_b32_e32 v2, s52 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x140 ; GFX8-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s53 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x130 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 
; GFX8-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x130 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 +; GFX8-NEXT: s_add_u32 s42, s0, 0x120 ; GFX8-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x120 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s42 ; GFX8-NEXT: v_mov_b32_e32 v0, s54 ; GFX8-NEXT: v_mov_b32_e32 v2, s37 ; GFX8-NEXT: v_mov_b32_e32 v5, s43 ; GFX8-NEXT: s_add_u32 s40, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v0, s41 ; GFX8-NEXT: s_addc_u32 s41, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s40 ; GFX8-NEXT: v_mov_b32_e32 v2, s35 ; GFX8-NEXT: v_mov_b32_e32 v5, s41 ; GFX8-NEXT: s_add_u32 s38, s0, 0x100 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NEXT: v_mov_b32_e32 v0, s39 ; GFX8-NEXT: s_addc_u32 s39, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NEXT: v_mov_b32_e32 v2, s33 ; GFX8-NEXT: v_mov_b32_e32 v5, s39 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -9069,52 +9069,48 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: s_add_u32 s36, s0, 0xf0 ; GFX8-NEXT: s_addc_u32 s37, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NEXT: s_add_u32 s34, s0, 0xe0 ; GFX8-NEXT: s_addc_u32 s35, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: 
v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 ; GFX8-NEXT: v_mov_b32_e32 v5, s35 ; GFX8-NEXT: s_add_u32 s30, s0, 0xd0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NEXT: s_addc_u32 s31, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NEXT: s_add_u32 s28, s0, 0xc0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NEXT: s_addc_u32 s29, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NEXT: s_add_u32 s26, s0, 0xb0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 ; GFX8-NEXT: s_addc_u32 s27, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NEXT: s_add_u32 s22, s0, 0xa0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 ; GFX8-NEXT: s_addc_u32 s23, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: s_add_u32 s22, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NEXT: v_mov_b32_e32 v5, s23 -; GFX8-NEXT: s_add_u32 s22, s0, 0x90 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s23, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s19 @@ -9124,32 +9120,32 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: s_add_u32 s20, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s21, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; 
GFX8-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: s_add_u32 s18, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: s_add_u32 s16, s0, 0x60 ; GFX8-NEXT: s_addc_u32 s17, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -10400,70 +10396,70 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1f0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 +; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 +; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1b0 +; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; 
GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0 +; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x190 +; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: v_mov_b32_e32 v54, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x180 +; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: v_mov_b32_e32 v56, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x170 +; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x160 +; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 +; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: 
v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 ; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] @@ -10482,159 +10478,159 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: 
v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_readlane_b32 s2, v62, 4 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_readlane_b32 s3, v62, 5 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s31 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 0194e3c6ce37b..61fb7b9f0a3b1 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s + define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: ; GCN-NOHSA-SI: ; %bb.0: ; %entry @@ -202,10 +203,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: flat_store_short v[2:3], v4 @@ -220,10 +221,10 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2 ; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4 @@ -348,9 +349,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -429,10 +430,10 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -480,12 +481,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -504,12 +505,12 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -613,8 +614,8 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -633,36 +634,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; 
GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5] @@ -671,38 +673,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-NOHSA-VI-NEXT: flat_load_ushort v9, v[10:11] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v10, v[12:13] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v11, v[14:15] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v12, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v13, v[2:3] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v14, v[4:5] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v15, v[6:7] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0 ; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1] ; GCN-NOHSA-VI-NEXT: flat_load_ushort v20, v[2:3] @@ -1802,9 +1803,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 
s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1834,9 +1835,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1957,16 +1958,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -1989,16 +1990,16 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2151,20 +2152,20 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2209,20 +2210,20 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -2408,27 +2409,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, 
s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2436,8 +2439,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2466,27 +2467,29 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 -; 
GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 @@ -2494,8 +2497,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2741,58 +2742,58 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -2853,56 +2854,56 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3211,73 +3212,73 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -3323,71 +3324,71 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 
0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 @@ -3830,25 +3831,25 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 @@ -3856,20 +3857,19 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 @@ 
-3880,14 +3880,14 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 @@ -3903,8 +3903,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 @@ -3920,40 +3921,40 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -4047,128 +4048,128 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; 
GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -4735,56 +4736,56 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 ; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 
v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 @@ -4805,11 +4806,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 @@ -4821,9 +4821,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 @@ -4839,40 +4840,40 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -4936,157 +4937,159 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; 
GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: 
s_sext_i32_i16 s31, s31 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s29 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s28 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s27 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s24 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s22 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s20 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 @@ -5094,8 +5097,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6078,8 +6079,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6219,9 +6220,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6250,9 +6251,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 @@ -6382,16 +6383,16 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 
s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6426,16 +6427,16 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -6609,20 +6610,20 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6666,21 +6667,21 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 @@ -6879,44 +6880,44 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6959,44 +6960,44 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -7285,23 +7286,23 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -7310,30 +7311,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 @@ -7389,30 +7390,30 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x60 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 @@ -7425,20 +7426,20 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s9, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -7768,96 +7769,96 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; 
GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 
v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 @@ -7867,8 +7868,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -7923,87 +7924,87 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 
0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 -; 
GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 @@ -8013,10 +8014,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -8533,18 +8534,18 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 ; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 ; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 @@ -8562,11 +8563,11 @@ define amdgpu_kernel void 
@constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 @@ -8577,38 +8578,38 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] @@ -8618,9 +8619,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8628,9 +8629,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8638,9 +8639,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 
s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8648,9 +8649,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8658,9 +8659,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -8738,9 +8739,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8748,9 +8749,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8758,9 +8759,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8768,9 +8769,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8778,9 +8779,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -8789,12 +8790,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: 
s_add_u32 s38, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 @@ -8808,12 +8809,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 +; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 @@ -8827,12 +8828,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 @@ -8845,11 +8846,11 @@ 
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -8864,11 +8865,11 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 9d3a9f1dff8e8..0aabc9af5aa85 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -177,9 +177,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; 
GFX12-NEXT: s_endpgm ; @@ -394,10 +394,10 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; @@ -458,12 +458,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: s_endpgm @@ -482,12 +482,12 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: s_endpgm @@ -609,15 +609,15 @@ 
define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: flat_store_dword v[4:5], v6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -640,15 +640,15 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: flat_store_dword v[4:5], v6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -791,18 +791,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 ; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -823,18 +823,18 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -903,13 +903,13 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[12:13], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 
0x0 -; GFX12-NEXT: v_mov_b32_e32 v10, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v9, s13 ; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v2, s6 -; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3 +; GFX12-NEXT: v_mov_b32_e32 v6, s2 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16 @@ -991,19 +991,19 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX7-HSA-NEXT: s_endpgm @@ -1026,17 +1026,17 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1184,27 +1184,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 ; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -1218,27 +1219,28 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -1302,14 +1304,14 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[12:15], s[10:11], 0x20 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v12, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v12, 0 
:: v_dual_mov_b32 v1, s13 ; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15 -; GFX12-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14 -; GFX12-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 -; GFX12-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 -; GFX12-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 -; GFX12-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s1 +; GFX12-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s3 +; GFX12-NEXT: v_mov_b32_e32 v10, s2 ; GFX12-NEXT: s_clause 0x2 ; GFX12-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32 ; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16 @@ -1390,27 +1392,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -1432,27 +1434,27 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 @@ -2257,8 +2259,8 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2415,16 +2417,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: s_endpgm @@ -2443,16 +2445,16 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: s_endpgm @@ -2595,20 +2597,20 @@ define amdgpu_kernel void 
@constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2838,9 +2840,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s6, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -2848,9 +2850,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -2858,9 +2860,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr 
addrspace(1) %ou ; GFX7-HSA-NEXT: s_add_u32 s2, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -2890,9 +2892,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -2900,9 +2902,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -2910,9 +2912,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 @@ -3176,12 +3178,12 @@ define amdgpu_kernel void 
@constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 @@ -3189,19 +3191,19 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 @@ -3209,9 +3211,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 @@ -3219,23 +3221,23 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -3271,9 +3273,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3281,9 +3283,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3291,9 +3293,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3301,9 +3303,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3311,9 +3313,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3321,9 +3323,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -3331,9 +3333,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 @@ -4187,55 +4189,55 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 ; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 ; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 ; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 ; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v26, s35 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 ; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] @@ -4249,7 +4251,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 @@ -4257,16 +4258,17 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 @@ -4276,9 +4278,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX7-HSA-NEXT: s_nop 0 @@ -4286,9 +4288,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -4296,9 +4298,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -4353,9 +4355,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s30, s36, 0xf0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4363,9 +4365,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s28, s36, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NOHSA-NEXT: s_addc_u32 s29, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4373,9 +4375,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s26, s36, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX8-NOHSA-NEXT: s_addc_u32 s27, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s27 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4383,9 +4385,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s24, s36, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4393,9 +4395,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s22, s36, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NOHSA-NEXT: s_addc_u32 s23, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s23 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4403,9 +4405,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4413,9 +4415,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s18, s36, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: s_addc_u32 s19, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4423,9 +4425,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4433,9 +4435,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s14, s36, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4443,9 +4445,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4453,9 +4455,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s10, s36, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4463,9 +4465,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4473,9 +4475,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s6, s36, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4483,9 +4485,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, 
s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -4493,9 +4495,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX8-NOHSA-NEXT: s_add_u32 s2, s36, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 @@ -5092,35 +5094,35 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GFX7-HSA-NEXT: 
s_add_u32 s0, s36, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 @@ -5793,20 +5795,21 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -5815,9 +5818,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs 
; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -5825,9 +5828,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -5835,9 +5838,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -5869,18 +5872,18 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NOHSA-NEXT: s_add_u32 s20, s36, 0x50 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5888,9 +5891,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s16, s36, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5898,9 +5901,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s12, s36, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5908,9 +5911,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX8-NOHSA-NEXT: s_add_u32 s8, s36, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5918,9 +5921,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; 
GFX8-NOHSA-NEXT: s_add_u32 s4, s36, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 542b0ccedbf14..bbf73c7e9c892 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -69,9 +69,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %ld = load i64, ptr addrspace(4) %in @@ -149,10 +149,10 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -198,12 +198,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v5, s8 ; 
GFX7-NEXT: v_mov_b32_e32 v6, s9 -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6] -; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[5:6] ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_endpgm @@ -221,12 +221,12 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, s8 ; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -317,12 +317,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: s_endpgm @@ -341,12 +341,12 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: 
v_mov_b32_e32 v5, s1 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v6, s2 ; GFX8-NEXT: v_mov_b32_e32 v7, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm @@ -444,27 +444,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s16 @@ -486,27 +486,27 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; 
GFX8-NEXT: s_add_u32 s8, s16, 32 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: s_addc_u32 s9, s17, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: v_mov_b32_e32 v7, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_add_u32 s4, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s16 @@ -680,20 +680,21 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 ; GFX7-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: v_mov_b32_e32 v17, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 0x50 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_add_u32 s18, s16, 64 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -702,9 +703,9 @@ define 
amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s12, s16, 48 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: s_addc_u32 s13, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 @@ -712,9 +713,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s8, s16, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 @@ -722,9 +723,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: s_add_u32 s4, s16, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: v_mov_b32_e32 v4, s16 @@ -756,18 +757,18 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s25 ; GFX8-NEXT: s_addc_u32 s25, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v6, s26 ; GFX8-NEXT: v_mov_b32_e32 v7, s27 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: s_add_u32 s20, s36, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NEXT: s_addc_u32 s21, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s21 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -775,9 +776,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s16, s36, 64 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: s_addc_u32 s17, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -785,9 +786,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s12, s36, 48 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: s_addc_u32 s13, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -795,9 +796,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s8, s36, 32 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_addc_u32 s9, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -805,9 +806,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: s_add_u32 s4, s36, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_addc_u32 s5, s37, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, s36 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index a71a5bbf95645..62d99adc1405c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s + ; TODO: NOT AND define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_load_i8: @@ -228,14 +229,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: flat_store_short v[0:1], v4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: flat_store_byte v[2:3], v0 @@ -247,14 +248,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v3, s1 ; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: flat_store_byte v[2:3], v0 @@ -454,9 +455,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -535,10 +536,10 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1554,9 +1555,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1586,9 +1587,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1716,9 +1717,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1748,9 +1749,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1909,20 +1910,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, 
s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -1967,20 +1968,20 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2174,24 +2175,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 
v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 @@ -2199,7 +2201,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2232,24 +2233,25 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 @@ -2257,7 +2259,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2509,58 +2510,58 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2621,56 +2622,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2986,66 +2987,67 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 @@ -3053,7 +3055,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -3098,64 +3099,65 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; 
GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, 
s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 @@ -3163,7 +3165,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 @@ -3611,25 +3612,25 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: 
s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 @@ -3637,31 +3638,31 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 ; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[19:20], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 @@ -3677,30 +3678,30 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; 
GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 @@ -3713,13 +3714,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 @@ -3731,12 +3732,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -3830,93 +3831,93 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: 
s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s67 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s61 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, 
s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 @@ -3929,13 +3930,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 @@ -3947,12 +3948,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4517,46 +4518,46 @@ define amdgpu_kernel void 
@constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] ; GFX7-HSA-NEXT: 
s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 @@ -4574,49 +4575,49 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s37 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 @@ -4628,15 +4629,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 @@ -4647,13 +4648,14 @@ define amdgpu_kernel void 
@constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 @@ -4661,7 +4663,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -4730,107 +4731,107 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i32 s66, s15, 0x80008 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s66 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s14 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s62 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s61 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s54 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 
0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s9, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 +; 
GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s35 @@ -4842,15 +4843,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 @@ -4861,13 +4862,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 32 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s16, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 @@ -4875,7 +4877,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 @@ -6067,9 +6068,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6098,9 +6099,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6231,17 +6232,17 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6275,17 +6276,17 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6463,11 +6464,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6477,13 +6478,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6520,11 +6521,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6534,13 +6535,13 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6741,37 +6742,37 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6779,11 +6780,11 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6821,44 +6822,44 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; 
GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -7159,22 +7160,23 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 @@ -7182,29 +7184,28 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 @@ -7262,21 +7263,21 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 @@ -7296,9 +7297,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -7306,9 +7307,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -7316,10 +7317,10 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7656,96 +7657,96 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 
v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX7-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 @@ -7811,94 +7812,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xd0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v2, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s23 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x80 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 @@ -8443,75 +8444,75 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 ; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 ; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 
v[24:25], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] @@ -8520,9 +8521,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8530,9 +8531,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8540,9 +8541,9 @@ define 
amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8550,9 +8551,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8560,9 +8561,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8570,9 +8571,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -8648,21 +8649,21 @@ define amdgpu_kernel void 
@constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 @@ -8676,12 +8677,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 @@ -8695,12 +8696,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 
s31 ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 @@ -8721,12 +8722,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 @@ -8740,12 +8741,12 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 @@ -8758,11 +8759,11 @@ 
define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 @@ -8777,11 +8778,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10594,10 +10595,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10650,10 +10651,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -10968,9 +10969,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11041,9 +11042,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11452,21 +11453,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 
s0, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -11558,21 +11559,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -12146,20 +12147,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -12287,20 +12288,20 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index c119ef274bb04..7f26738eb0aac 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -260,16 +260,16 @@ define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 @@ -336,27 +336,27 @@ define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dword v14, v[6:7] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -426,27 +426,27 @@ define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -517,27 +517,27 @@ define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: 
flat_load_dwordx3 v[8:10], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] @@ -607,27 +607,27 @@ define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] @@ -702,16 +702,16 @@ define amdgpu_kernel void 
@global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 @@ -720,19 +720,19 @@ define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 388006281abdc..1390377ad6499 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s + ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { @@ -235,8 +236,8 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-NEXT: s_add_u32 s2, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: flat_store_short v[4:5], v1 @@ -543,18 +544,18 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -696,18 +697,18 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-NEXT: s_mov_b32 
flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -1893,8 +1894,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -2045,8 +2046,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 @@ -2209,24 +2210,24 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 @@ -2446,25 +2447,25 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: 
s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2722,30 +2723,30 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 @@ -2760,23 +2761,22 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; 
GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 @@ -2786,6 +2786,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) @@ -3130,42 +3131,42 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 
v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 @@ -3177,13 +3178,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) @@ -3196,8 +3197,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) @@ -3209,16 +3208,18 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 @@ -3664,23 +3665,23 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3688,8 +3689,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15] @@ -3698,20 +3699,20 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 @@ -3733,21 +3734,21 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 @@ -3755,14 +3756,14 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) @@ -3770,9 +3771,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31 @@ -3780,20 +3782,18 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 @@ -3804,42 +3804,43 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 
0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4468,28 +4469,28 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: 
s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 @@ -4505,13 +4506,13 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v28 -; GCN-HSA-NEXT: v_bfe_i32 v34, v29, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v32, v28, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v29 +; GCN-HSA-NEXT: v_bfe_i32 v34, v29, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 @@ -4525,87 +4526,86 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[31:34] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 ; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCN-HSA-NEXT: s_add_u32 
s2, s0, 0xa0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 ; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-HSA-NEXT: 
s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 @@ -4614,9 +4614,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) @@ -4628,25 +4628,26 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16 ; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 ; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26 ; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -5861,14 +5862,14 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9 @@ -6013,17 +6014,18 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -6173,24 +6175,24 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, 
s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -6395,17 +6397,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 @@ -6656,11 +6658,11 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 @@ -6672,9 +6674,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 @@ -6686,26 +6688,22 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8 @@ -6713,9 +6711,13 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 @@ -7039,33 +7041,33 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v7 @@ -7506,17 +7508,17 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] @@ -7534,12 +7536,12 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 ; GCN-HSA-NEXT: 
s_waitcnt vmcnt(3) @@ -7575,9 +7577,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 @@ -7587,67 +7589,67 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] @@ -8219,12 +8221,12 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -8233,9 +8235,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 @@ -8253,115 +8255,118 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, 
v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[15:18] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v11 ; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 ; GCN-HSA-NEXT: 
v_bfe_i32 v11, v9, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v9 -; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_bfe_i32 v14, v19, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[12:15] ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 
v12, 16, v3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v26, v27, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_bfe_i32 v10, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 @@ -8369,19 +8374,16 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 7203545ebf9a8..1706d1b7b97fa 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -6,6 +6,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s ; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s + define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_load_i32: ; SI-NOHSA: ; %bb.0: ; %entry @@ -368,17 +369,17 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0 @@ -478,27 +479,27 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -612,27 +613,27 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] @@ -745,27 +746,27 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3] @@ -883,27 +884,27 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] @@ -1020,17 +1021,17 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1038,17 +1039,17 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; 
GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] @@ -1732,8 +1733,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1861,8 +1862,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 @@ -2003,22 +2004,22 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCNX3-HSA-NEXT: 
s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2196,25 +2197,25 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 
v21, s3 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2431,42 +2432,42 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 @@ -2474,17 +2475,17 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) @@ -2497,8 +2498,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(5) @@ -2510,16 +2509,18 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -2802,12 +2803,12 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2819,52 +2820,52 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCNX3-HSA-NEXT: s_add_u32 
s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 -; 
GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -3203,30 +3204,30 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -3235,52 +3236,51 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: 
flat_store_dwordx4 v[36:37], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) @@ -3288,6 +3288,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22 @@ -3296,86 +3297,86 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, 
v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 ; GCNX3-HSA-NEXT: 
v_ashrrev_i32_e32 v24, 31, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] @@ -3663,7 +3664,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112 @@ -3672,6 +3672,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 @@ -3833,13 +3834,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24 ; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v29 -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v30 ; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v31 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26 ; GCN-GFX908-HSA-NEXT: 
v_ashrrev_i32_e32 v42, 31, v25 @@ -3847,7 +3845,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v26 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v27 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 @@ -3856,10 +3854,13 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v1 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v2 ; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v3 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 +; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a2 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a1 ; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a0 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 ; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 @@ -3988,15 +3989,15 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v0, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60 ; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0 @@ -4018,8 +4019,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] @@ -4033,74 +4034,73 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 
v31, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] ; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 @@ -4109,35 +4109,35 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; 
GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6 @@ -4516,34 +4516,34 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; 
GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] @@ -4551,17 +4551,17 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] ; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0 @@ -4581,12 +4581,12 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(7) ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 0c399d65d01cc..4974e74eecd36 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s + ; TODO: NOT AND define amdgpu_kernel void @global_load_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_load_i8: @@ -253,13 +254,13 @@ define amdgpu_kernel void @global_load_v3i8(ptr addrspace(1) %out, ptr addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dword v2, v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GCN-HSA-NEXT: flat_store_short v[0:1], v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_byte v[0:1], v3 ; GCN-HSA-NEXT: s_endpgm @@ -1754,8 +1755,8 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v9 @@ -1912,8 +1913,8 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 24, v7 @@ -2085,14 +2086,14 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -2100,19 +2101,19 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 16, 8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GCN-HSA-NEXT: 
v_bfe_u32 v9, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] @@ -2324,14 +2325,14 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) @@ -2339,19 +2340,19 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] @@ -2606,17 +2607,17 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -2626,22 +2627,22 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v7 ; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GCN-HSA-NEXT: v_bfe_u32 v8, v6, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6 ; GCN-HSA-NEXT: v_bfe_u32 v9, v6, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 ; GCN-HSA-NEXT: v_bfe_u32 v7, v5, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5 ; GCN-HSA-NEXT: v_bfe_u32 v8, v5, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9] ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v4 @@ -2655,23 +2656,23 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GCN-HSA-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v5, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] @@ -3017,17 +3018,17 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 @@ -3037,15 +3038,14 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 @@ -3053,6 +3053,7 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 @@ -3066,23 +3067,23 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, 
v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] @@ -3571,117 +3572,117 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v17, v15, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v15 ; GCN-HSA-NEXT: v_bfe_u32 v18, v15, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[16:19] ; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v12 ; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v13 ; 
GCN-HSA-NEXT: v_bfe_u32 v15, v13, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v13 ; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v10 ; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v11 ; GCN-HSA-NEXT: v_bfe_u32 v13, v11, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v11 ; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 ; GCN-HSA-NEXT: v_bfe_u32 v11, v8, 8, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v8 ; GCN-HSA-NEXT: v_bfe_u32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9 ; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v9 ; GCN-HSA-NEXT: v_bfe_u32 v12, v9, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] ; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_bfe_u32 v18, v1, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v19, v1, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4 ; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v6 ; 
GCN-HSA-NEXT: v_bfe_u32 v18, v6, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v6 ; GCN-HSA-NEXT: v_bfe_u32 v19, v6, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v1, v7, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v7 ; GCN-HSA-NEXT: v_bfe_u32 v2, v7, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16] @@ -4332,9 +4333,9 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v14 @@ -4348,13 +4349,13 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v16, v15, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v12 ; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v12, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] @@ -4362,13 +4363,13 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v13, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 @@ -4380,10 +4381,10 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v14, v10, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v10, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v11 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 16, 8 @@ -4398,59 +4399,60 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v8, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 8 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9 ; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v6 ; GCN-HSA-NEXT: v_bfe_i32 v13, v6, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 0, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v4 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v5 ; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v5, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v3 ; GCN-HSA-NEXT: v_bfe_i32 v17, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 8, 8 @@ -4460,19 +4462,18 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[0:1], v[6:9] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] @@ -5751,14 +5752,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dword v0, v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_bfe_u32 v6, v0, 8, 8 @@ -5908,8 +5909,8 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 @@ -6077,30 +6078,30 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v8, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v16 ; GCN-HSA-NEXT: v_bfe_u32 v0, v16, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v3, v15, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v16, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v15, 8, 8 @@ -6330,7 +6331,6 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8 ; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8 ; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 @@ -6338,35 +6338,36 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm @@ -6611,11 +6612,11 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 
v14, v5 @@ -6627,51 +6628,51 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_bfe_u32 v11, v0, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] ; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] ; GCN-HSA-NEXT: 
v_mov_b32_e32 v16, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 ; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v17, v1, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] @@ -7070,64 +7071,64 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], 
v[12:15] ; GCN-HSA-NEXT: s_endpgm @@ -7596,71 +7597,71 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v4 ; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v9 ; GCN-HSA-NEXT: v_bfe_u32 v10, v9, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, 
v8 ; GCN-HSA-NEXT: v_bfe_u32 v10, v8, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GCN-HSA-NEXT: v_bfe_u32 v10, v6, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 ; GCN-HSA-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v4 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 @@ -7669,48 +7670,48 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 ; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v5 ; GCN-HSA-NEXT: v_bfe_u32 v0, v5, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v6, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] @@ -8439,12 +8440,12 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 @@ -8453,9 +8454,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] @@ -8464,11 +8465,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] @@ -8483,29 +8484,30 @@ define amdgpu_kernel void 
@global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 ; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] @@ -8514,9 +8516,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8525,21 +8527,21 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 @@ -8552,10 +8554,10 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -10834,8 +10836,8 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 ; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1 @@ -10871,11 +10873,11 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6 @@ -11316,8 +11318,8 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 @@ -11365,11 +11367,11 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_or_b32 s2, s2, s17 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 @@ -12004,24 +12006,25 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm @@ -12815,8 +12818,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -12916,22 +12919,22 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 3fedd68edaea2..ed36a3d0036c8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s + ; Combine on select c, (load x), (load y) -> load (select c, x, y) ; drops MachinePointerInfo, so it can't be relied on for correctness. 
@@ -21,16 +22,16 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p ; GCN-NEXT: s_cselect_b32 s2, s4, s5 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_add_u32 s2, s2, 4 -; GCN-NEXT: flat_load_dword v0, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: flat_load_dword v0, v[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: flat_load_dword v1, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -62,8 +63,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr ; GCN-NEXT: s_cselect_b32 s3, s3, s6 ; GCN-NEXT: s_cselect_b32 s2, s4, s5 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 @@ -93,8 +94,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3) ; GCN-NEXT: s_cselect_b32 s0, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -125,8 +126,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr ; GCN-NEXT: s_cselect_b32 s0, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: 
v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -140,81 +141,3 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr store i64 %tmp5, ptr addrspace(1) %ptr2, align 8 ret void } - -; The resultant load cannot be treated as uniform -define amdgpu_kernel void @sample_test(ptr addrspace(1) %dest, ptr addrspace(1) %sourceA, ptr addrspace(1) %sourceB, i1 %tobool.not.i) #0 { -; GCN-LABEL: sample_test: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x18 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_add_u32_e32 v3, vcc, s2, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm -entry: - %0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %conv2.i.i.i1 = zext i32 %0 to i64 - %arrayidx.i = getelementptr i64, ptr addrspace(1) %sourceA, i64 %conv2.i.i.i1 - %dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1 - %ld0 = load i64, ptr addrspace(1) %arrayidx.i, align 8, !amdgpu.noclobber !0 - %ld1 = load i64, ptr addrspace(1) %sourceB, align 8 - %cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1 - store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8 - ret void -} - -; The resultant load cannot be treated as uniform -define amdgpu_kernel void @constant_is_not_uniform(ptr addrspace(1) %dest, ptr 
addrspace(4) %sourceA, ptr addrspace(4) %sourceB, i1 %tobool.not.i) #0 { -; GCN-LABEL: constant_is_not_uniform: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x18 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_add_u32_e32 v3, vcc, s2, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm -entry: - %0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %conv2.i.i.i1 = zext i32 %0 to i64 - %arrayidx.i = getelementptr i64, ptr addrspace(4) %sourceA, i64 %conv2.i.i.i1 - %dest.gep = getelementptr i64, ptr addrspace(1) %dest, i64 %conv2.i.i.i1 - %ld0 = load i64, ptr addrspace(4) %arrayidx.i, align 8 - %ld1 = load i64, ptr addrspace(4) %sourceB, align 8 - %cond.i = select i1 %tobool.not.i, i64 %ld0, i64 %ld1 - store i64 %cond.i, ptr addrspace(1) %dest.gep, align 8 - ret void -} - -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll 
b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index 09225709a1acf..11bade44eb909 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -81,13 +81,13 @@ define amdgpu_kernel void @caller() { ; GFX9-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 ; GFX9-SDAG-NEXT: s_addc_u32 s5, s5, callee@gotpcrel32@hi+12 ; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 @@ -111,13 +111,13 @@ define amdgpu_kernel void @caller() { ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] ; GFX9-GISEL-NEXT: s_mov_b32 s12, s14 @@ -140,13 +140,13 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-SDAG-NEXT: s_add_u32 s4, s4, callee@gotpcrel32@lo+4 ; GFX9ARCH-SDAG-NEXT: s_addc_u32 s5, s5, 
callee@gotpcrel32@hi+12 ; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 ; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 @@ -169,13 +169,13 @@ define amdgpu_kernel void @caller() { ; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 ; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 ; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[12:13] ; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 2f3ca8b795f7d..044303ac3d67d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -104,11 +104,11 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; GFX1250-GISEL-NEXT: s_add_co_i32 s3, s3, 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 
s5, ttmp6, 0x40008 ; GFX1250-GISEL-NEXT: s_mul_i32 s3, s4, s3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, s3 ; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s4, s5 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX1250-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index fc8467cb73ab6..228d7a397751d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1093,8 +1093,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 ; GFX11-NEXT: s_add_u32 s2, s6, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1111,8 +1112,9 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll 
b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll index af713179a888d..607c3cbfce616 100644 --- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll +++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll @@ -93,12 +93,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f @@ -176,12 +176,12 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) { ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s0 ; GFX12-NEXT: s_mov_b32 s2, s0 ; GFX12-NEXT: s_mov_b32 s3, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1f diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index faf70f55876f7..de2135a3bfd74 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -165,9 +165,9 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, 
v[0:1] ; CHECK-NEXT: s_cbranch_execnz .LBB1_8 ; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_i32 s6, s8, 1 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 5b7c36559a366..81dcb7aaad545 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -94,12 +94,11 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -107,6 +106,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) @@ -160,11 +160,11 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; 
CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) @@ -382,12 +382,11 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -395,6 +394,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) @@ -448,11 +448,11 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index d95965caa81ab..c936a73c07a56 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s + ; Check code generation for memmoves with statically unknown size and all ; combinations of the following address spaces: ; destination address space: 0, 1, 3, 5 @@ -32,10 +33,10 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -129,8 +130,8 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -180,10 +181,10 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execz .LBB1_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 
v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -277,8 +278,8 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -330,8 +331,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_cbranch_execz .LBB2_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v1 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -472,10 +473,10 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execz .LBB3_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -569,8 +570,8 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB3_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: 
v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -622,8 +623,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_cbranch_execz .LBB4_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v1 -; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: v_mov_b32_e32 v11, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -772,10 +773,10 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB5_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -869,8 +870,8 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB5_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -918,10 +919,10 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; 
CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -1015,8 +1016,8 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -1127,10 +1128,10 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 +; CHECK-NEXT: v_mov_b32_e32 v13, v9 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 @@ -1224,8 +1225,8 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_mov_b32_e32 v10, v8 +; CHECK-NEXT: v_mov_b32_e32 v11, v9 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v11, vcc_lo ; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v10, -16 @@ -1347,8 +1348,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB10_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 
+; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s9, 0 @@ -1813,8 +1814,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_cbranch_execz .LBB15_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 +; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index e6fd6aba92cf0..f52bcf52f9f3b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -388,14 +388,14 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000 +; GCN-NEXT: s_brev_b32 s0, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 9585c486aeb9e..0057c4da1196d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -308,40 +308,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: 
global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: 
v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -414,40 +421,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: 
v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -520,40 +534,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: 
v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -627,14 +648,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX908-NEXT: s_mov_b32 s14, s10 -; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX908-NEXT: s_mov_b32 s32, 0 ; 
GFX908-NEXT: v_mov_b32_e32 v40, 0 @@ -648,40 +669,47 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32 ; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35] +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 -; 
GFX908-NEXT: v_accvgpr_write_b32 a24, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] @@ -897,10 +925,10 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53] -; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -933,40 +961,47 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 
a0, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 -; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] @@ -1034,40 +1069,47 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 ; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(7) +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: 
v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: s_waitcnt vmcnt(5) +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: s_waitcnt vmcnt(3) +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: s_waitcnt vmcnt(2) +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: s_waitcnt vmcnt(1) +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 -; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 -; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 -; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 -; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 -; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 -; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 -; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; 
GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 8b6bb9b8c5fcd..b3d6ac4c7e9ca 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -4,6 +4,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s + ; Check that we do not copy agprs to vgprs and back inside the loop. ; Final result should be read only once after the loop. @@ -55,12 +56,12 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -414,12 +415,12 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -767,12 +768,12 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 
13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1157,12 +1158,12 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1510,12 +1511,12 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -1902,12 +1903,12 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: 
v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2295,12 +2296,12 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2658,12 +2659,12 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -2863,12 +2864,12 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 13 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; 
GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -3272,12 +3273,12 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: ; %bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 10 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 @@ -3479,779 +3480,6 @@ exit: ret void } -; Phi exit use is vgpr abi use -define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { -; GFX908-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 
-; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: s_mov_b32 s4, 16 -; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX908-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] -; GFX908-NEXT: s_add_i32 s4, s4, -1 -; GFX908-NEXT: s_cmp_lg_u32 s4, 0 -; GFX908-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX908-NEXT: ; %bb.2: ; %exit -; GFX908-NEXT: s_nop 14 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX908-NEXT: v_accvgpr_read_b32 
v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 -; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX90A-NEXT: 
v_accvgpr_write_b32 a3, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: s_add_i32 s4, s4, -1 -; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 
v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: -; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 -; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader -; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 
v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: s_add_i32 s0, s0, -1 -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] -; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; 
GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; 
GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_setpc_b64 s[30:31] -entry: - br label %for.cond.preheader - -for.cond.preheader: - %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] - %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) - %inc = add nuw nsw i32 %c, 1 - %cc = icmp eq i32 %inc, 16 - br i1 %cc, label %exit, label %for.cond.preheader - -exit: - ret <32 x float> %mai.1 -} - -define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { -; GFX908-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX908-NEXT: 
v_accvgpr_write_b32 a12, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 -; GFX908-NEXT: s_mov_b32 s4, 16 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX908-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_nop 1 -; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_add_i32 s4, s4, -1 -; GFX908-NEXT: s_cmp_lg_u32 s4, 0 -; GFX908-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX908-NEXT: ; %bb.2: ; %exit -; GFX908-NEXT: s_nop 14 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 -; 
GFX908-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v9, 0 -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v11, 0 -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 -; GFX90A-NEXT: v_mov_b32_e32 v13, 0 -; GFX90A-NEXT: v_mov_b32_e32 v14, 0 -; GFX90A-NEXT: v_mov_b32_e32 v15, 0 -; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v17, 0 -; GFX90A-NEXT: v_mov_b32_e32 v18, 0 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_mov_b32_e32 v20, 0 -; GFX90A-NEXT: v_mov_b32_e32 v21, 0 -; GFX90A-NEXT: v_mov_b32_e32 v22, 0 -; GFX90A-NEXT: v_mov_b32_e32 v23, 0 -; GFX90A-NEXT: v_mov_b32_e32 v24, 0 -; GFX90A-NEXT: v_mov_b32_e32 v25, 0 -; GFX90A-NEXT: v_mov_b32_e32 v26, 0 -; GFX90A-NEXT: v_mov_b32_e32 v27, 0 -; GFX90A-NEXT: v_mov_b32_e32 v28, 0 -; GFX90A-NEXT: v_mov_b32_e32 v29, 0 -; GFX90A-NEXT: v_mov_b32_e32 v30, 0 -; GFX90A-NEXT: v_mov_b32_e32 v31, 0 -; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 
v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: s_add_i32 s4, s4, -1 -; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] -; GFX90A-NEXT: s_nop 15 -; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX90A-NEXT: 
v_accvgpr_read_b32 v23, a23 -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: -; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0 -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_mov_b32_e32 v11, 0 -; GFX942-NEXT: v_mov_b32_e32 v12, 0 -; GFX942-NEXT: v_mov_b32_e32 v13, 0 -; GFX942-NEXT: v_mov_b32_e32 v14, 0 -; GFX942-NEXT: v_mov_b32_e32 v15, 0 -; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-NEXT: v_mov_b32_e32 v18, 0 -; GFX942-NEXT: v_mov_b32_e32 v19, 0 -; GFX942-NEXT: v_mov_b32_e32 v20, 0 -; GFX942-NEXT: v_mov_b32_e32 v21, 0 -; GFX942-NEXT: v_mov_b32_e32 v22, 0 -; GFX942-NEXT: v_mov_b32_e32 v23, 0 -; GFX942-NEXT: v_mov_b32_e32 v24, 0 -; GFX942-NEXT: v_mov_b32_e32 v25, 0 -; GFX942-NEXT: v_mov_b32_e32 v26, 0 -; GFX942-NEXT: v_mov_b32_e32 v27, 0 -; GFX942-NEXT: v_mov_b32_e32 v28, 0 -; GFX942-NEXT: v_mov_b32_e32 v29, 0 -; GFX942-NEXT: v_mov_b32_e32 v30, 0 -; GFX942-NEXT: v_mov_b32_e32 v31, 0 -; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 -; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader -; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 -; 
GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: s_add_i32 s0, s0, -1 -; GFX942-NEXT: s_cmp_lg_u32 s0, 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] -; GFX942-NEXT: s_nop 15 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: 
v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 -; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 -; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 -; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 -; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 -; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 -; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 -; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 -; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 -; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 -; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 -; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 -; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 -; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 -; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 -; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 -; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 -; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 -; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 -; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 -; GFX942-NEXT: 
v_accvgpr_read_b32 v22, a22 -; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 -; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 -; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 -; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 -; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 -; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 -; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 -; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 -; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 -; GFX942-NEXT: s_setpc_b64 s[30:31] -entry: - br label %for.cond.preheader - -for.cond.preheader: - %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] - %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) - %inc = add nuw nsw i32 %c, 1 - %cc = icmp eq i32 %inc, 16 - br i1 %cc, label %exit, label %for.cond.preheader - -exit: - ret <32 x float> %mai.1 -} - declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index c8cc40faf1e84..f9295e42f55c7 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -408,7 +408,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 @@ -424,6 +423,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 ; 
GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 +; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index eff0680fe9a31..0aa665594086a 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -357,11 +357,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; CI-NEXT: s_min_i32 s2, s2, s6 ; CI-NEXT: s_min_i32 s1, s1, s5 ; CI-NEXT: s_min_i32 s0, s0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -378,11 +378,11 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; VI-NEXT: s_min_i32 s2, s2, s6 ; VI-NEXT: s_min_i32 s1, s1, s5 ; VI-NEXT: s_min_i32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -506,8 +506,8 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -524,8 +524,8 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x 
i32], ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -682,8 +682,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_or_b32 s3, s4, s3 ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -720,8 +720,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1098,9 +1098,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI-NEXT: s_and_b32 s0, s0, 0xffff ; CI-NEXT: s_or_b32 s1, s1, s7 ; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1131,9 +1131,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: 
s_endpgm @@ -1644,9 +1644,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1661,9 +1661,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -3430,9 +3430,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v0, s9 ; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -3464,9 +3464,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -3774,11 +3774,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; CI-NEXT: s_or_b32 s2, s2, s6 ; CI-NEXT: s_or_b32 s1, s1, s5 ; CI-NEXT: s_or_b32 s0, s0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v0, s0 
; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s8 ; CI-NEXT: v_mov_b32_e32 v5, s9 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -3823,11 +3823,11 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_min_u32 s5, s6, s5 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_or_b32 s0, s5, s0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -3932,8 +3932,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3950,8 +3950,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -4062,8 +4062,8 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -4080,8 +4080,8 @@ 
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 05ff5c8bb0b3a..8adfed45e2514 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -91,15 +91,15 @@ define amdgpu_kernel void @withcall() { ; GFX9-NEXT: s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: ds_write_b32 v3, v3 offset:8 @@ -156,15 +156,15 @@ define amdgpu_kernel void @withcall() { ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12 ; G_GFX9-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x0 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; G_GFX9-NEXT: s_mov_b32 s14, s10 ; G_GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] ; G_GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] -; G_GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; 
G_GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] ; G_GFX9-NEXT: v_mov_b32_e32 v3, 0 ; G_GFX9-NEXT: v_mov_b32_e32 v4, 8 ; G_GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; G_GFX9-NEXT: s_mov_b64 s[0:1], s[20:21] ; G_GFX9-NEXT: s_mov_b64 s[2:3], s[22:23] ; G_GFX9-NEXT: s_mov_b64 s[4:5], s[12:13] ; G_GFX9-NEXT: s_mov_b32 s12, s16 diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index aff07787a2fb7..79414e52eed9c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -10,9 +10,9 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4 ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base ; CHECK-NEXT: s_movk_i32 s34, 0x80 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: v_dual_mov_b32 v27, s35 :: v_dual_mov_b32 v26, s34 ; CHECK-NEXT: s_add_nc_u64 s[44:45], s[34:35], 0x70 -; CHECK-NEXT: v_dual_mov_b32 v26, s34 :: v_dual_mov_b32 v27, s35 ; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 9afaab5ebcfb6..0e55d1df4e9c3 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -278,8 +278,8 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_add_u32 
s0, s4, 0x3039 @@ -300,9 +300,9 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: s_mov_b32 s9, 0xf237b ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_add_u32 s0, s4, 0x3039 @@ -361,12 +361,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_or_b32 s0, s2, 63 -; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_add_u32 s0, s8, 63 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_addc_u32 s1, s9, 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -383,12 +383,12 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_or_b32 s0, s2, 63 -; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s0, s8, 63 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index 89bcfb3b3a834..d8d47ad3cedfd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -41,8 +41,8 @@ define amdgpu_cs void @test_simple_indirect_call() { ; GFX10-NEXT: s_bitset0_b32 s11, 21 ; GFX10-NEXT: s_add_u32 s8, s8, s0 ; GFX10-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] ; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index c98bcd53bec1a..ff3833245488b 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -292,10 +292,10 @@ define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 % ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] ; GCN-NEXT: s_bitset1_b32 s2, 15 -; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_and_b32 s0, s2, 0xff00 ; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 1156f2718cf1e..4f1ac5f6f4683 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -357,8 +357,8 @@ define 
hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -385,8 +385,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 @@ -1031,8 +1031,8 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1348,8 +1348,8 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1573,8 +1573,8 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: 
s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -1781,8 +1781,8 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1, ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[48:49] ; GFX8-NEXT: s_mov_b64 s[2:3], s[50:51] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2053,8 +2053,8 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2391,8 +2391,8 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: v_mov_b32_e32 v31, v0 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 @@ -2561,9 +2561,9 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dword s1, s[4:5], 0xec ; GFX8-NEXT: s_add_u32 s0, 0, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: s_addc_u32 s1, s1, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 2e15c37bc19fb..5a0ab4ef7b42b 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -89,8 +89,8 @@ define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) { ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[2:3], v5 ; GFX8-NEXT: flat_store_dword v[0:1], v4 @@ -140,7 +140,7 @@ define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) { ; GFX11-NEXT: s_add_u32 s0, s0, s2 ; GFX11-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX11-NEXT: s_endpgm ; @@ -246,8 +246,8 @@ define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[2:3], v5 ; GFX8-NEXT: flat_store_dword v[0:1], v4 @@ -303,7 +303,7 @@ define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) ; GFX11-NEXT: s_add_u32 s0, s0, 5 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 
v3, s1 ; GFX11-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..09526ea5ac878 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -50,7 +50,7 @@ body: | ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, 12884901888, implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) @@ -221,7 +221,7 @@ body: | ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, 
csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83, 3 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index b761f689d6af5..70992be391054 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -184,9 +184,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v19, v9 ; GFX9-NEXT: v_mov_b32_e32 v18, v8 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 @@ -1547,9 +1547,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v21, v13 ; GFX9-NEXT: v_mov_b32_e32 v20, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 @@ -1624,8 +1624,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_waitcnt vmcnt(6) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir index 7d11c2deb6658..628560cccf71c 100644 --- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: not llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -start-before=greedy,2 -filetype=null %s 2>&1 | FileCheck %s # This testcase fails register allocation at the same time it performs diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index fc154604b8700..ed38673a16283 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -96,22 +96,29 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace ; CHECK-NEXT: v_mov_b32_e32 v65, 2.0 ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(7) ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31] ; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29] +; CHECK-NEXT: s_waitcnt vmcnt(6) ; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27] ; CHECK-NEXT: v_mov_b64_e32 v[56:57], v[24:25] +; CHECK-NEXT: s_waitcnt vmcnt(5) ; CHECK-NEXT: v_mov_b64_e32 v[54:55], v[22:23] ; CHECK-NEXT: v_mov_b64_e32 v[52:53], 
v[20:21] +; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: v_mov_b64_e32 v[50:51], v[18:19] ; CHECK-NEXT: v_mov_b64_e32 v[48:49], v[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: v_mov_b64_e32 v[46:47], v[14:15] ; CHECK-NEXT: v_mov_b64_e32 v[44:45], v[12:13] +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: v_mov_b64_e32 v[42:43], v[10:11] ; CHECK-NEXT: v_mov_b64_e32 v[40:41], v[8:9] +; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: v_mov_b64_e32 v[38:39], v[6:7] ; CHECK-NEXT: v_mov_b64_e32 v[36:37], v[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mov_b64_e32 v[34:35], v[2:3] ; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[0:1] ; CHECK-NEXT: s_add_i32 s1, s1, 1 diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index e29be2b744874..6108a5550747c 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -592,6 +592,8 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] +; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] ; CHECK-NEXT: v_mov_b64_e32 v[6:7], v[36:37] ; CHECK-NEXT: v_mov_b64_e32 v[8:9], v[38:39] @@ -605,8 +607,6 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: v_mov_b64_e32 v[24:25], v[54:55] ; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[56:57] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[58:59] -; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[60:61] -; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[62:63] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 
a7fcb6439703a..210c31d44418c 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: rotl_i32: ; R600: ; %bb.0: ; %entry @@ -137,9 +138,9 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_and_b32 s3, s6, 31 ; GFX8-NEXT: s_mov_b32 s1, s0 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -272,11 +273,11 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_and_b32 s3, s12, 31 ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 71c7797cbc68e..bec261cd97e26 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s + define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: 
rotr_i32: ; R600: ; %bb.0: ; %entry @@ -122,9 +123,9 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_mov_b32 s1, s0 ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], s3 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -237,11 +238,11 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -309,227 +310,6 @@ entry: ret void } -define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) { -; R600-LABEL: rotr_v8i32: -; R600: ; %bb.0: ; %entry -; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; R600-NEXT: CF_END -; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X, -; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W, -; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z, -; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, KC0[6].X, KC0[8].X, -; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W, -; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z, -; 
R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y, -; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, -; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; R600-NEXT: LSHR * T3.X, PV.W, literal.x, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; -; SI-LABEL: rotr_v8i32: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s24, s19, 31 -; SI-NEXT: s_mov_b32 s4, s11 -; SI-NEXT: s_mov_b32 s5, s11 -; SI-NEXT: s_and_b32 s25, s18, 31 -; SI-NEXT: s_mov_b32 s11, s10 -; SI-NEXT: s_and_b32 s26, s17, 31 -; SI-NEXT: s_mov_b32 s6, s9 -; SI-NEXT: s_mov_b32 s7, s9 -; SI-NEXT: s_and_b32 s27, s16, 31 -; SI-NEXT: s_mov_b32 s9, s8 -; SI-NEXT: s_and_b32 s23, s23, 31 -; SI-NEXT: s_mov_b32 s16, s15 -; SI-NEXT: s_mov_b32 s17, s15 -; SI-NEXT: s_and_b32 s22, s22, 31 -; SI-NEXT: s_mov_b32 s15, s14 -; SI-NEXT: s_and_b32 s21, s21, 31 -; SI-NEXT: s_mov_b32 s18, s13 -; SI-NEXT: s_mov_b32 s19, s13 -; SI-NEXT: s_and_b32 s20, s20, 31 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s24 -; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s25 -; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s26 -; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s23 -; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], s21 -; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], s20 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s27 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v1, s18 -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_mov_b32_e32 v3, s16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: s_endpgm -; -; GFX8-LABEL: rotr_v8i32: -; GFX8: ; %bb.0: ; 
%entry -; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s4, s19, 31 -; GFX8-NEXT: s_mov_b32 s2, s11 -; GFX8-NEXT: s_mov_b32 s3, s11 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX8-NEXT: s_and_b32 s3, s17, 31 -; GFX8-NEXT: s_mov_b32 s6, s9 -; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: s_and_b32 s5, s18, 31 -; GFX8-NEXT: s_mov_b32 s11, s10 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3 -; GFX8-NEXT: s_and_b32 s3, s16, 31 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3 -; GFX8-NEXT: s_and_b32 s3, s23, 31 -; GFX8-NEXT: s_mov_b32 s10, s15 -; GFX8-NEXT: s_mov_b32 s11, s15 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s3 -; GFX8-NEXT: s_and_b32 s3, s22, 31 -; GFX8-NEXT: s_mov_b32 s15, s14 -; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s3 -; GFX8-NEXT: s_and_b32 s3, s21, 31 -; GFX8-NEXT: s_mov_b32 s16, s13 -; GFX8-NEXT: s_mov_b32 s17, s13 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[16:17], s3 -; GFX8-NEXT: s_and_b32 s3, s20, 31 -; GFX8-NEXT: s_mov_b32 s13, s12 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: s_add_u32 s10, s0, 16 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_endpgm -; -; GFX10-LABEL: rotr_v8i32: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
-; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s19, s19, 31 -; GFX10-NEXT: s_mov_b32 s2, s11 -; GFX10-NEXT: s_mov_b32 s3, s11 -; GFX10-NEXT: s_and_b32 s17, s17, 31 -; GFX10-NEXT: s_mov_b32 s4, s9 -; GFX10-NEXT: s_mov_b32 s5, s9 -; GFX10-NEXT: s_and_b32 s16, s16, 31 -; GFX10-NEXT: s_mov_b32 s9, s8 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 -; GFX10-NEXT: s_and_b32 s23, s23, 31 -; GFX10-NEXT: s_mov_b32 s6, s15 -; GFX10-NEXT: s_mov_b32 s7, s15 -; GFX10-NEXT: s_and_b32 s22, s22, 31 -; GFX10-NEXT: s_mov_b32 s15, s14 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 -; GFX10-NEXT: s_mov_b32 s16, s13 -; GFX10-NEXT: s_mov_b32 s17, s13 -; GFX10-NEXT: s_and_b32 s3, s20, 31 -; GFX10-NEXT: s_mov_b32 s13, s12 -; GFX10-NEXT: s_and_b32 s5, s21, 31 -; GFX10-NEXT: s_and_b32 s18, s18, 31 -; GFX10-NEXT: s_mov_b32 s11, s10 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: v_mov_b32_e32 v1, s16 -; GFX10-NEXT: v_mov_b32_e32 v2, s14 -; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: v_mov_b32_e32 v4, s8 -; GFX10-NEXT: v_mov_b32_e32 v5, s4 -; GFX10-NEXT: v_mov_b32_e32 v6, s10 -; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: rotr_v8i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s19, s19, 31 -; GFX11-NEXT: s_mov_b32 s2, s11 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_and_b32 s17, s17, 31 -; GFX11-NEXT: s_mov_b32 s4, s9 -; GFX11-NEXT: 
s_mov_b32 s5, s9 -; GFX11-NEXT: s_and_b32 s16, s16, 31 -; GFX11-NEXT: s_mov_b32 s9, s8 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s19 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s17 -; GFX11-NEXT: s_and_b32 s23, s23, 31 -; GFX11-NEXT: s_mov_b32 s6, s15 -; GFX11-NEXT: s_mov_b32 s7, s15 -; GFX11-NEXT: s_and_b32 s22, s22, 31 -; GFX11-NEXT: s_mov_b32 s15, s14 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 -; GFX11-NEXT: s_mov_b32 s16, s13 -; GFX11-NEXT: s_mov_b32 s17, s13 -; GFX11-NEXT: s_and_b32 s3, s20, 31 -; GFX11-NEXT: s_mov_b32 s13, s12 -; GFX11-NEXT: s_and_b32 s5, s21, 31 -; GFX11-NEXT: s_and_b32 s18, s18, 31 -; GFX11-NEXT: s_mov_b32 s11, s10 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s23 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s22 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s3 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[16:17], s5 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16 -; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4 -; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_mov_b32_e32 v6, s10 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX11-NEXT: s_endpgm -entry: - %tmp0 = sub <8 x i32> , %y - %tmp1 = shl <8 x i32> %x, %tmp0 - %tmp2 = lshr <8 x i32> %x, %y - %tmp3 = or <8 x i32> %tmp1, %tmp2 - store <8 x i32> %tmp3, ptr addrspace(1) %in - ret void -} - declare i16 @llvm.fshr.i16(i16, i16, i16) define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) { diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 15fc987d1e7c6..3a08a6e38f493 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash < %s | FileCheck -check-prefix=GCN %s + define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_pat1: ; GCN: ; %bb.0: @@ -35,13 +36,13 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: v_mov_b32_e32 v0, 0x5a ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -137,9 +138,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen @@ -175,11 +176,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -213,11 
+214,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -252,11 +253,11 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_sad_u32 v3, s1, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 8861b7726a4c5..a98c04f9d11dd 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -351,14 +351,14 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s12, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_addc_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 @@ -377,9 +377,9 @@ define amdgpu_kernel void @s_saddo_i64(ptr 
addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -437,8 +437,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_addc_u32 s9, s5, s7 ; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: s_xor_b32 s4, s6, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index fdb20f372ab8d..c73241aa226ac 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: @@ -154,8 +155,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] ; GCN-NEXT: s_sub_u32 s4, s4, s6 ; GCN-NEXT: s_subb_u32 s5, s5, s7 -; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -239,9 +240,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_xor_b64 
s[6:7], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -452,8 +453,8 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1327,9 +1328,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1510,8 +1511,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 @@ -1703,8 +1704,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 
v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 @@ -1797,10 +1798,10 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 -; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 +; GCN-IR-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v2 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..b539fb548d640 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2107,8 +2107,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; NOSDWA-NEXT: .LBB22_1: ; %bb1 ; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 ; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3 -; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: s_lshr_b32 s7, s6, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_mov_b64 s[4:5], 1 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 @@ -2129,8 +2129,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX89-NEXT: .LBB22_1: ; %bb1 ; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX89-NEXT: s_lshl_b32 s7, s4, 3 -; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: s_lshr_b32 s7, s6, s7 +; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: v_mov_b32_e32 v1, s5 ; GFX89-NEXT: s_mov_b64 s[4:5], 1 ; GFX89-NEXT: v_mov_b32_e32 v2, s7 @@ -2151,8 +2151,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX9-NEXT: .LBB22_1: ; %bb1 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: 
s_lshl_b32 s7, s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_lshr_b32 s7, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b64 s[4:5], 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s7 @@ -2174,8 +2174,8 @@ define void @crash_lshlrevb16_not_reg_op() { ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_lshl_b32 s7, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: s_mov_b64 s[4:5], 1 ; GFX10-NEXT: flat_store_byte v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir index f11fe4aa6e00e..2cb85bb3fb3b7 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -75,8 +75,8 @@ body: | ; GFX9-LABEL: name: sgpr96_aligned_src_dst ; GFX9: liveins: $sgpr0_sgpr1_sgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2, implicit-def $sgpr4_sgpr5_sgpr6 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2 + ; GFX9-NEXT: $sgpr6 = S_MOV_B32 $sgpr2 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1 $sgpr4_sgpr5_sgpr6 = COPY $sgpr0_sgpr1_sgpr2 ... @@ -88,8 +88,8 @@ body: | ; GFX9-LABEL: name: sgpr96_killed ; GFX9: liveins: $sgpr4_sgpr5_sgpr6 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr10 = S_MOV_B32 $sgpr6, implicit $sgpr4_sgpr5_sgpr6, implicit-def $sgpr8_sgpr9_sgpr10 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr4_sgpr5, implicit killed $sgpr4_sgpr5_sgpr6 + ; GFX9-NEXT: $sgpr10 = S_MOV_B32 killed $sgpr6 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr4_sgpr5 $sgpr8_sgpr9_sgpr10 = COPY killed $sgpr4_sgpr5_sgpr6 ... 
@@ -101,8 +101,8 @@ body: | ; GFX9-LABEL: name: sgpr128_forward ; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -114,8 +114,8 @@ body: | ; GFX9-LABEL: name: sgpr128_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1 $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ... @@ -127,8 +127,8 @@ body: | ; GFX9-LABEL: name: sgpr128_killed ; GFX9: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 killed $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -140,9 +140,9 @@ body: | ; GFX9-LABEL: name: sgpr160_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr4 = S_MOV_B32 $sgpr12, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4 = S_MOV_B32 $sgpr12 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ... @@ -154,9 +154,9 @@ body: | ; GFX9-LABEL: name: sgpr160_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... 
@@ -168,9 +168,9 @@ body: | ; GFX9-LABEL: name: sgpr160_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9-NEXT: $sgpr12 = S_MOV_B32 killed $sgpr4 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... @@ -183,9 +183,9 @@ body: | ; GFX9-LABEL: name: sgpr192_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ... 
@@ -197,9 +197,9 @@ body: | ; GFX9-LABEL: name: sgpr192_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... @@ -211,9 +211,9 @@ body: | ; GFX9-LABEL: name: sgpr192_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... 
@@ -225,10 +225,10 @@ body: | ; GFX9-LABEL: name: sgpr256_forward ; GFX9: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... 
@@ -240,10 +240,10 @@ body: | ; GFX9-LABEL: name: sgpr256_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -255,10 +255,10 @@ body: | ; GFX9-LABEL: name: sgpr256_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... 
@@ -270,14 +270,14 @@ body: | ; GFX9-LABEL: name: sgpr512_forward ; GFX9: liveins: $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9-NEXT: 
$sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... @@ -289,14 +289,14 @@ body: | ; GFX9-LABEL: name: sgpr512_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 
$sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... @@ -308,14 +308,14 @@ body: | ; GFX9-LABEL: name: sgpr512_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: 
$sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... 
@@ -327,22 +327,22 @@ body: | ; GFX9-LABEL: name: sgpr1024_forward ; GFX9: liveins: $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41, implicit 
$sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 $sgpr50_sgpr51, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: 
$sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33 + ; GFX9-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35 + ; GFX9-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37 + ; GFX9-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39 + ; GFX9-NEXT: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41 + ; GFX9-NEXT: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43 + ; GFX9-NEXT: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45 + ; GFX9-NEXT: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47 + ; GFX9-NEXT: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49 + ; GFX9-NEXT: $sgpr18_sgpr19 = S_MOV_B64 
$sgpr50_sgpr51 + ; GFX9-NEXT: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53 + ; GFX9-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55 + ; GFX9-NEXT: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57 + ; GFX9-NEXT: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59 + ; GFX9-NEXT: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61 + ; GFX9-NEXT: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ... @@ -354,22 +354,22 @@ body: | ; GFX9-LABEL: name: sgpr1024_backward ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 
$sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... 
@@ -381,21 +381,21 @@ body: | ; GFX9-LABEL: name: sgpr1024_killed ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr52_sgpr53 = 
S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, 
implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr62_sgpr63 = S_MOV_B64 killed $sgpr30_sgpr31 + ; GFX9-NEXT: $sgpr60_sgpr61 = S_MOV_B64 killed $sgpr28_sgpr29 + ; GFX9-NEXT: $sgpr58_sgpr59 = S_MOV_B64 killed $sgpr26_sgpr27 + ; GFX9-NEXT: $sgpr56_sgpr57 = S_MOV_B64 killed $sgpr24_sgpr25 + ; GFX9-NEXT: $sgpr54_sgpr55 = S_MOV_B64 killed $sgpr22_sgpr23 + ; GFX9-NEXT: $sgpr52_sgpr53 = S_MOV_B64 killed $sgpr20_sgpr21 + ; GFX9-NEXT: $sgpr50_sgpr51 = S_MOV_B64 killed $sgpr18_sgpr19 + ; GFX9-NEXT: $sgpr48_sgpr49 = S_MOV_B64 killed $sgpr16_sgpr17 + ; GFX9-NEXT: $sgpr46_sgpr47 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9-NEXT: $sgpr44_sgpr45 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9-NEXT: $sgpr42_sgpr43 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9-NEXT: $sgpr40_sgpr41 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9-NEXT: $sgpr38_sgpr39 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9-NEXT: $sgpr36_sgpr37 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9-NEXT: $sgpr34_sgpr35 = 
S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9-NEXT: $sgpr32_sgpr33 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll index 076fff7612428..fe7f1d85acfa0 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -29,13 +29,13 @@ define amdgpu_kernel void @kernel() { ; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: s_mov_b32 s14, s10 ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 2b698d3ee4854..8eb09d6884351 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(ptr addrspace(1) %out, ptr add ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: 
s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -73,10 +73,10 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(ptr addrspace(1) %out, ptr add ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] ; GCN-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: buffer_load_dword v0, v[4:5], s[8:11], 0 addr64 offset:8 ; GCN-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 760a126afa995..f7c8320739ef1 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -11,11 +11,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -69,11 +69,11 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 
0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -98,11 +98,11 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -183,11 +183,11 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -212,11 +212,11 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -241,11 +241,11 @@ define amdgpu_kernel void 
@v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -272,11 +272,11 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 1c2215d39dc02..37b5060ce7566 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -21064,10 +21064,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s46, s16 ; GFX900-NEXT: s_mov_b32 s47, s17 @@ -21105,10 +21105,10 @@ define void @s_shuffle_v2i64_v8i64__12_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; 
GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s46, s16 ; GFX90A-NEXT: s_mov_b32 s47, s17 @@ -21288,10 +21288,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s50, s16 ; GFX900-NEXT: s_mov_b32 s51, s17 @@ -21329,10 +21329,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s50, s16 ; GFX90A-NEXT: s_mov_b32 s51, s17 @@ -21364,10 +21364,10 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s30, s12 ; GFX942-NEXT: s_mov_b32 s31, s13 @@ -22124,10 +22124,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; 
GFX900-NEXT: s_mov_b32 s46, s18 ; GFX900-NEXT: s_mov_b32 s47, s19 @@ -22165,10 +22165,10 @@ define void @s_shuffle_v2i64_v8i64__12_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s46, s18 ; GFX90A-NEXT: s_mov_b32 s47, s19 @@ -22348,10 +22348,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX900-NEXT: v_writelane_b32 v0, s50, 6 ; GFX900-NEXT: v_writelane_b32 v0, s51, 7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[36:51] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s50, s18 ; GFX900-NEXT: s_mov_b32 s51, s19 @@ -22389,10 +22389,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX90A-NEXT: v_writelane_b32 v0, s50, 6 ; GFX90A-NEXT: v_writelane_b32 v0, s51, 7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[36:51] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s50, s18 ; GFX90A-NEXT: s_mov_b32 s51, s19 @@ -22424,10 +22424,10 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s30, s14 ; GFX942-NEXT: s_mov_b32 s31, s15 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 
8fcaf5e15f7d5..64e20a0d284a7 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -30,17 +30,17 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_add_u32 s0, s0, s17 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_getpc_b64 s[18:19] -; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s13, s15 ; GFX9-NEXT: s_mov_b32 s12, s14 ; GFX9-NEXT: s_mov_b64 s[14:15], src_private_base -; GFX9-NEXT: v_mov_b32_e32 v5, s18 +; GFX9-NEXT: s_getpc_b64 s[18:19] +; GFX9-NEXT: s_add_u32 s18, s18, indirect@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s19, s19, indirect@rel32@hi+12 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s18 ; GFX9-NEXT: v_mov_b32_e32 v6, s19 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b32 s14, s16 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 101787abf8ea7..f1f0d737b08fc 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -61,9 +61,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -78,9 +78,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: 
v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -356,9 +356,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -373,9 +373,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -437,9 +437,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -454,9 +454,9 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -549,9 +549,9 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr 
addrspace(1) ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -566,9 +566,9 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 41ffd01fc7e23..c88557e7d0ea3 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -7,9 +7,9 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 76f8f484fc763..24e737d50459e 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -359,9 +359,9 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI-NEXT: s_or_b32 s2, s4, s2 ; VI-NEXT: s_add_i32 s3, s3, 0x20000 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: 
v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 1a0f75e048cb9..cc48b4dd02e67 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -31,9 +31,9 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX908-NEXT: .LBB0_2: ; %use ; GFX908-NEXT: s_nop 2 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a4 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a7 ; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 ; GFX908-NEXT: v_accvgpr_write_b32 a8, 5 ; GFX908-NEXT: v_accvgpr_write_b32 a9, 1 @@ -83,6 +83,7 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 ; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2 @@ -90,7 +91,6 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 @@ -128,9 +128,9 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: ;;#ASMSTART ; 
GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: ;;#ASMSTART @@ -139,8 +139,8 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 -; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 ; GFX908-NEXT: ;;#ASMSTART @@ -394,11 +394,11 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX908-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_waitcnt vmcnt(4) ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 -; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3] ; GFX908-NEXT: s_nop 3 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b5474b8974b29..39af96cd9762a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10399,11 +10399,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: 
scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload @@ -10411,6 +10408,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1 @@ -10544,124 +10544,124 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v60 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v34 ; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 -; 
GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v22 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v26 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v24 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v28 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; 
GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 +; GFX10-FLATSCR-NEXT: ;;#ASMSTART +; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 +; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: 
v_mov_b32_e32 v45, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v46 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v42 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v38 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v58 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v59 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 -; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 -; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v89 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, 
v77 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v90 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v91 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v92 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v86 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v87 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v88 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v82 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v83 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v84 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v78 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v79 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v80 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v74 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v75 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v76 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v70 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v71 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v72 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b14e97d..cf24ebe9be2c1 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s --check-prefixes=TONGA ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG + define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i16_7: ; GCN: ; %bb.0: @@ -1649,8 +1650,8 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %in) ; GCN-NEXT: s_cmp_ge_u32 s3, s2 ; GCN-NEXT: s_cselect_b32 s8, s4, s3 ; GCN-NEXT: .LBB8_3: -; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm @@ -3332,8 +3333,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_add_u32 s0, s6, 16 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 +; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: v_mov_b32_e32 v5, s7 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -6120,18 +6121,18 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 -; TONGA-NEXT: v_mov_b32_e32 v0, s6 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 -; TONGA-NEXT: v_mov_b32_e32 v1, s7 ; TONGA-NEXT: s_add_u32 s2, s6, 32 -; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 ; TONGA-NEXT: s_addc_u32 s3, s7, 0 +; TONGA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s2 ; TONGA-NEXT: v_mov_b32_e32 v1, s3 ; TONGA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: s_add_u32 s0, s6, 16 +; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v5, s1 ; TONGA-NEXT: v_mov_b32_e32 v4, s0 @@ -6688,11 +6689,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc ; TONGA-NEXT: .LBB12_14: +; TONGA-NEXT: s_add_u32 s0, s4, 16 ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; 
TONGA-NEXT: v_mov_b32_e32 v1, s5 -; TONGA-NEXT: s_add_u32 s0, s4, 16 -; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: s_addc_u32 s1, s5, 0 +; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: v_mov_b32_e32 v0, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -9033,9 +9034,9 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_mov_b32_e32 v4, s4 ; TONGA-NEXT: v_mov_b32_e32 v5, s5 ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; TONGA-NEXT: v_mov_b32_e32 v9, s1 ; TONGA-NEXT: v_mov_b32_e32 v8, s0 ; TONGA-NEXT: s_add_u32 s0, s0, 16 +; TONGA-NEXT: v_mov_b32_e32 v9, s1 ; TONGA-NEXT: s_addc_u32 s1, s1, 0 ; TONGA-NEXT: v_mov_b32_e32 v11, s1 ; TONGA-NEXT: v_mov_b32_e32 v10, s0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 02d2e6c1473ab..4400d52e46fbb 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: @@ -433,8 +434,8 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v16 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1189,9 +1190,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; 
GCN-IR-NEXT: s_sub_u32 s4, s6, s4 ; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1659,8 +1660,8 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB11_3 @@ -1850,8 +1851,8 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 @@ -1950,10 +1951,10 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 382d8928a28b0..a5ed626849228 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -348,14 +348,14 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s12, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_subb_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 @@ -374,9 +374,9 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_sub_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -434,8 +434,8 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_subb_u32 s9, s5, s7 ; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: s_xor_b32 s4, s6, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index d33e94809b326..d201b1124a3a3 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -16,19 
+16,19 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 ; MUBUF-NEXT: s_add_u32 s36, s36, s11 -; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s0 ; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] -; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d2394bab82c77..a8d93c61d0424 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -855,13 +855,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE32-OPT-NEXT: s_movk_i32 s32, 0x1200 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE32-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE32-OPT-NEXT: s_mov_b32 s12, s8 ; WAVE32-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE32-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE32-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE32-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE32-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; 
WAVE32-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo @@ -892,13 +892,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-OPT-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE64-OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; WAVE64-OPT-NEXT: s_movk_i32 s32, 0x2400 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 +; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 ; WAVE64-OPT-NEXT: s_mov_b32 s13, s9 +; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE64-OPT-NEXT: s_mov_b32 s12, s8 ; WAVE64-OPT-NEXT: s_mov_b64 s[8:9], s[4:5] ; WAVE64-OPT-NEXT: s_mov_b32 s4, s32 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v3, 42 -; WAVE64-OPT-NEXT: v_mov_b32_e32 v4, 17 -; WAVE64-OPT-NEXT: v_or3_b32 v31, v0, v1, v2 ; WAVE64-OPT-NEXT: s_mov_b32 s14, s10 ; WAVE64-OPT-NEXT: s_mov_b32 s17, stack_passed_argument@abs32@hi ; WAVE64-OPT-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll index a4e23ae87614f..b2b9f5bc93365 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -67,9 +67,9 @@ define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out @@ -521,8 +521,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm @@ -535,8 +535,8 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm @@ -575,10 +575,9 @@ define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i3 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 8 @@ -647,9 +646,9 @@ define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_b128 v4, v[0:3] ; GFX11-NEXT: s_endpgm store <4 x i32> %x, ptr addrspace(3) %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 480eb0dd5fe9c..d9308e9188add 100644 
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -55,8 +55,8 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HAWAII-NEXT: s_add_u32 s0, s8, 14 ; HAWAII-NEXT: s_addc_u32 s1, s9, 0 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] ; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 @@ -291,8 +291,8 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll index b4036517cc0d5..af206fb40e5b2 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll @@ -94,8 +94,8 @@ define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 { ; GFX900-NEXT: s_bitcmp1_b32 s2, 0 ; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v2, s1 ; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 ; GFX900-NEXT: v_mov_b32_e32 v1, s0 ; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3 ; GFX900-NEXT: s_branch .LBB2_2 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index d10ef709f8e33..5cbf4d66314d9 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -321,8 +321,8 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_addc_u32 s3, s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 @@ -666,8 +666,8 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX8-NEXT: s_sub_u32 s0, s0, s2 ; GFX8-NEXT: s_subb_u32 s1, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -692,8 +692,9 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b @@ -944,9 +945,9 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 @@ -958,8 +959,8 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v10, v14 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc ; GFX8-NEXT: 
v_sub_u32_e32 v4, vcc, v8, v12 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 4621be5cab450..007a384ca9299 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -12,20 +12,15 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) # ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.1: ; %ift ; CHECK-NEXT: s_mov_b32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: ; %bb.2: ; %ife ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index d4a8a0d762afd..81383725c5ac5 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=4 -o - %s | FileCheck %s + ; Make sure we can rematerialize split 64-bit constants (which ; MachineLICM hoisted out of the loop) and avoid 
spilling inside the ; loop. @@ -15,16 +16,16 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[2:3], 0x100 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40260000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 9c0beb2ed358c..1996a8e272c5a 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -135,8 +135,8 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0 ; HSA-TRAP-GFX803-NEXT: s_cbranch_vccz .LBB1_2 ; HSA-TRAP-GFX803-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) @@ -423,9 +423,9 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; 
HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index 1d96921ec1287..6ac3a4205613b 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -94,11 +94,11 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -201,11 +201,11 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out, ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s5, s7, s5 ; VI-NEXT: s_or_b32 s0, s0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 76f60f1e5dbfc..a6c126a937e2a 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -99,8 +99,8 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 1f93bf7a68972..0a3ef92e851a3 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -1760,8 +1760,8 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_u32 s6, s2, 2 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_add_u32 s6, s2, 6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -1960,8 +1960,8 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_add_u32 s6, s2, 2 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_add_u32 s6, s2, 6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_addc_u32 s7, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 1c50f930facba..5620ad29f6c36 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv_i64: ; GCN: ; %bb.0: @@ -186,9 +187,9 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: 
v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -372,8 +373,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -961,9 +962,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1132,8 +1133,8 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 @@ -1215,10 +1216,10 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 
v7, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB10_3 @@ -1324,9 +1325,9 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] ; GCN-IR-NEXT: .LBB11_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -1413,10 +1414,10 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GCN-IR-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v7, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB12_3 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index eaab3531824c4..93c9dd4b672b3 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -218,6 +218,7 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_sub_i32 s6, 0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -240,7 +241,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s0, s6, s0 ; GFX8-NEXT: s_sub_i32 s2, 0, s3 ; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1 -; 
GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 983acfc2c0699..6f60ce0b64787 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -211,9 +211,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; SI-NEXT: v_ldexp_f64 v[6:7], v[8:9], 32 ; SI-NEXT: v_ldexp_f64 v[8:9], v[10:11], 32 ; SI-NEXT: s_add_u32 s0, s8, 16 -; SI-NEXT: s_addc_u32 s1, s9, 0 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] ; SI-NEXT: v_add_f64 v[4:5], v[8:9], v[12:13] +; SI-NEXT: s_addc_u32 s1, s9, 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -250,8 +250,8 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: s_add_u32 s0, s8, 16 ; VI-NEXT: s_addc_u32 s1, s9, 0 ; VI-NEXT: v_mov_b32_e32 v11, s1 -; VI-NEXT: v_mov_b32_e32 v8, s8 ; VI-NEXT: v_mov_b32_e32 v10, s0 +; VI-NEXT: v_mov_b32_e32 v8, s8 ; VI-NEXT: v_mov_b32_e32 v9, s9 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -368,11 +368,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2 -; SI-NEXT: s_add_u32 s0, s4, 16 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 +; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; SI-NEXT: s_add_u32 s0, s4, 16 ; SI-NEXT: s_addc_u32 s1, s5, 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 @@ -391,11 +391,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s2 -; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 +; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; VI-NEXT: s_add_u32 s0, s4, 16 ; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 @@ -437,9 +437,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -454,9 +454,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -615,9 +615,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -632,9 +632,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 
v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -696,9 +696,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -713,9 +713,9 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -778,9 +778,9 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v3, s1 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -795,9 +795,9 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll index 22e4a24435f12..ce52b91723371 100644 --- a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -209,8 +209,8 @@ define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) # ; GFX9-LABEL: s_underflow_compare_fold_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sub_u32 s2, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index fc32bc644ddcd..e85536559a0ca 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + declare ptr @G() define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x i32> %vec) { @@ -94,13 +95,11 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: .LBB0_4: ; %Flow8 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[42:43], v[42:43] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[44:45], v[44:45] op_sel:[0,1] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_8 ; CHECK-NEXT: ; %bb.5: ; %LeafBlock ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 ; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[42:43], v[42:43] op_sel:[0,1] -; CHECK-NEXT: v_pk_mov_b32 v[2:3], v[44:45], v[44:45] op_sel:[0,1] ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.6: ; 
%sw.bb.i.i.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 31708a9b738db..549adae64ab37 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -103,9 +103,6 @@ define amdgpu_kernel void @partially_undef_copy() #0 { ; CHECK-NEXT: v_mov_b32_e32 v6, 6 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v0, v5 -; CHECK-NEXT: v_mov_b32_e32 v1, v6 -; CHECK-NEXT: v_mov_b32_e32 v2, v7 -; CHECK-NEXT: v_mov_b32_e32 v3, v8 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: v_mov_b32_e32 v0, v6 diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll index a81d9a458e23a..69e2eedaa4c86 100644 --- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -64,6 +64,22 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[32:63], v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[6:9] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 ; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 @@ -86,24 +102,8 
@@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ; CHECK-NEXT: v_accvgpr_write_b32 a44, v12 ; CHECK-NEXT: v_accvgpr_write_b32 a43, v11 ; CHECK-NEXT: v_accvgpr_write_b32 a42, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 ; CHECK-NEXT: v_accvgpr_write_b32 a37, v5 ; CHECK-NEXT: v_accvgpr_write_b32 a36, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:9] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -268,6 +268,8 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[32:63], v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 ; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 ; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 @@ -299,9 +301,7 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar ; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 ; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 ; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 ; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 ; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 ; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 28e6627b87413..69f1f73683c0f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s + define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: @@ -408,8 +409,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v9, v11, vcc ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v14 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 @@ -1268,8 +1269,8 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v12 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB8_3 @@ -1357,10 +1358,10 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 -; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 ; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz .LBB9_3 diff --git 
a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index ca93fcf3f55a2..cc2a62d49006b 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -48,10 +48,10 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2361,10 +2361,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2473,11 +2473,11 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 99b6ab7a6401b..8d29bca970541 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -248,8 +248,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-VI-NEXT: s_and_b32 s2, 
0xffff, s2 ; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-VI-NEXT: flat_store_dword v[0:1], v2 ; GISEL-VI-NEXT: s_endpgm @@ -749,8 +749,8 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 ; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 ; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-VI-NEXT: flat_store_dword v[0:1], v2 ; GISEL-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 6bf6d540299f1..1922f61b56793 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -2212,10 +2212,10 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX11-NEXT: s_endpgm %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 0ae31be32ed51..efa59eb8a9ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 
-amdgpu-early-ifcvt=1 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s + define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vopc_i32: ; GFX1032: ; %bb.0: @@ -872,8 +873,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 ; GFX1032-NEXT: s_cselect_b32 s8, s3, s1 ; GFX1032-NEXT: .LBB15_3: -; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1032-NEXT: s_endpgm @@ -1024,8 +1025,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: s_cmp_ge_u32 s2, s0 ; GFX1064-NEXT: s_cselect_b32 s4, s3, s1 ; GFX1064-NEXT: .LBB15_3: -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_mov_b32_e32 v1, s5 ; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1064-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index bfad131dc4413..96808e5c39ab5 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -1273,11 +1273,10 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i ; GISEL-NEXT: s_mov_b32 s1, s6 ; GISEL-NEXT: s_mov_b32 s2, s7 ; GISEL-NEXT: s_mov_b32 s3, s8 -; GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GISEL-NEXT: v_mov_b32_e32 
v5, s9 +; GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s9 ; GISEL-NEXT: scratch_store_b32 off, v4, s10 ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 0fdc1a83dddbd..57e0a0dfb3ea8 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1943,9 +1943,9 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 72672c8b6efad..1e27497dac736 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -422,13 +422,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] @@ -678,8 +678,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, 
i64 %arg) { ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 @@ -1252,13 +1252,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) { ; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 -; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] @@ -1508,8 +1508,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 92280b9ad8acf..8e10f21e3a089 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -643,9 +643,9 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) ; VI-NEXT: s_xor_b32 s1, s1, 0xf237b ; VI-NEXT: s_xor_b32 s0, s0, 0x3039 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -685,13 +685,13 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou ; VI-NEXT: s_mov_b32 s7, 0xf237b ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s2, 0x3039 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0xf237b +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -725,9 +725,9 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b32 s0, s0, 63 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -757,8 +757,8 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], -8 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm From 7bdbd3ae0f5fa13489ae2fba32f38fcfa0a10f44 Mon Sep 17 00:00:00 2001 From: vikashgu Date: Thu, 11 Dec 2025 07:59:52 +0000 Subject: 
[PATCH 4/4] Encoded liveness info as MO_laneMask using the COPY_LANEMASK instruction. --- llvm/include/llvm/CodeGen/MachineInstr.h | 5 +- llvm/include/llvm/Target/Target.td | 2 +- llvm/lib/CodeGen/ExpandPostRAPseudos.cpp | 1 + llvm/lib/CodeGen/TargetInstrInfo.cpp | 16 +++++- llvm/lib/CodeGen/VirtRegMap.cpp | 57 +++++++++++-------- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4 +- llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 8 +-- .../test/CodeGen/AMDGPU/carryout-selection.ll | 23 ++++---- ...hys-reg-implicit-operand-kills-subregs.mir | 2 +- llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir | 4 +- .../greedy-alloc-fail-sgpr1024-spill.mir | 8 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll | 5 +- .../ran-out-of-sgprs-allocation-failure.mir | 14 ++--- .../AMDGPU/snippet-copy-bundle-regression.mir | 2 +- llvm/test/CodeGen/X86/shift-i128.ll | 4 ++ llvm/test/CodeGen/X86/swift-return.ll | 18 +++--- 17 files changed, 104 insertions(+), 73 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 8e2574974a82d..d969178f74179 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1473,8 +1473,9 @@ class MachineInstr /// Return true is the instruction is an identity copy. 
bool isIdentityCopy() const { - return isCopy() && getOperand(0).getReg() == getOperand(1).getReg() && - getOperand(0).getSubReg() == getOperand(1).getSubReg(); + return (isCopy() || isCopyLaneMask()) && + getOperand(0).getReg() == getOperand(1).getReg() && + getOperand(0).getSubReg() == getOperand(1).getSubReg(); } /// Return true if this is a transient instruction that is either very likely diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index ae2181151b351..315de55b75510 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1346,7 +1346,7 @@ def REG_SEQUENCE : StandardPseudoInstruction { } def COPY : StandardPseudoInstruction { let OutOperandList = (outs unknown:$dst); - let InOperandList = (ins unknown:$src, variable_ops); + let InOperandList = (ins unknown:$src); let AsmString = ""; let hasSideEffects = false; let isAsCheapAsAMove = true; diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index c4ba4195f307f..7a291844c8670 100644 --- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -168,6 +168,7 @@ bool ExpandPostRA::run(MachineFunction &MF) { MadeChange |= LowerSubregToReg(&MI); break; case TargetOpcode::COPY: + case TargetOpcode::COPY_LANEMASK: TII->lowerCopy(&MI, TRI); MadeChange = true; break; diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index fef3a3663d3a8..b8b83e626fe86 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -884,12 +884,21 @@ void TargetInstrInfo::lowerCopy( if (IdentityCopy || SrcMO.isUndef()) { // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. 
- if (SrcMO.isUndef() || MI->getNumOperands() > 2) { + if (MI->getOpcode() == TargetOpcode::COPY && + (SrcMO.isUndef() || MI->getNumOperands() > 2)) { // We must make sure the super-register gets killed. Replace the // instruction with KILL. MI->setDesc(get(TargetOpcode::KILL)); return; } + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK && + (SrcMO.isUndef() || MI->getNumOperands() > 3)) { + // We must make sure the super-register gets killed. Replace the + // instruction with KILL. + MI->setDesc(get(TargetOpcode::KILL)); + return; + } + // Vanilla identity copy. MI->eraseFromParent(); return; @@ -900,7 +909,10 @@ void TargetInstrInfo::lowerCopy( DstMO.getReg().isPhysical() ? DstMO.isRenamable() : false, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); - if (MI->getNumOperands() > 2) + if (MI->getOpcode() == TargetOpcode::COPY && MI->getNumOperands() > 2) + transferImplicitOperands(MI, &TRI); + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK && + MI->getNumOperands() > 3) transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 1b299305bd450..cab3fca289a32 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -213,8 +213,8 @@ class VirtRegRewriter { void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; - uint64_t calcLiveRegUnitMask(const MachineOperand &MO, - MCRegister PhysReg) const; + LaneBitmask calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const; void addLiveInsForSubRanges(const LiveInterval &LI, MCRegister PhysReg) const; void handleIdentityCopy(MachineInstr &MI); void expandCopyBundle(MachineInstr &MI) const; @@ -476,11 +476,11 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { return true; } -// Return LaneBitmask value as unint64_t for PhysReg assigned to MO, +// Return LaneBitmask value for PhysReg assigned to MO, // representing 
its live register units at its parent MI. In case of undef or // fully live MO, return 0u. -uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, - MCRegister PhysReg) const { +LaneBitmask VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, + MCRegister PhysReg) const { Register Reg = MO.getReg(); const LiveInterval &LI = LIS->getInterval(Reg); const MachineInstr &MI = *MO.getParent(); @@ -492,20 +492,20 @@ uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, : LaneBitmask::getNone()); LaneBitmask LiveRegUnitMask; - DenseSet LiveRegUnits; + DenseSet LiveRegUnits; // dbgs() << "\n********** " << printReg(Reg, TRI) << "[ " << // printReg(PhysReg, TRI) << " ]" << " **********\n"; if (MO.isUndef()) - return 0u; + return LaneBitmask::getNone(); assert(LI.liveAt(MIIndex) && "Reads of completely dead register should be marked undef already"); if (LI.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; for (const LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & UseMask & Mask).any() && S.liveAt(MIIndex)) { @@ -515,7 +515,7 @@ uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, } } else { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; const LiveRange &UnitRange = LIS->getRegUnit(Unit); LaneBitmask Mask = (*Units).second; @@ -531,7 +531,7 @@ uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, } for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = (*Units).first; + MCRegUnit Unit = (*Units).first; LaneBitmask Mask = (*Units).second; if (LiveRegUnits.count(Unit)) { // dbgs() << "LIVE DEF UNIT : " << printRegUnit(Unit, TRI) << '\n'; @@ -541,10 +541,13 @@ uint64_t 
VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO, // dbgs() << "UseMask : " << PrintLaneMask(UseMask) << '\n'; // dbgs() << "LiveRegUnitMask : " << PrintLaneMask(LiveRegUnitMask) << '\n'; - if (UseMask == LiveRegUnitMask) - return 0u; + // If all lanes are live or dead, no need to create a COPY_LANEMASK + // instruction. + if (LiveRegUnitMask.all() || LiveRegUnitMask.none() || + LiveRegUnitMask == UseMask) + return LaneBitmask::getNone(); - return LiveRegUnitMask.getAsInteger(); + return LiveRegUnitMask; } void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { @@ -568,11 +571,14 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) { // give us additional liveness information: The target (super-)register // must not be valid before this point. Replace the COPY with a KILL // instruction to maintain this information. - - // Avoid COPY with an exact 3 operand, wiith third operand be Mask, as - // it same as a COPY with no additional liveness information. - if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 3 || - (MI.getNumOperands() == 3 && !MI.getOperand(2).isImm())) { + if (MI.getOpcode() == TargetOpcode::COPY && + (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2)) { + MI.setDesc(TII->get(TargetOpcode::KILL)); + LLVM_DEBUG(dbgs() << " replace by: " << MI); + return; + } + if (MI.getOpcode() == TargetOpcode::COPY_LANEMASK && + (MI.getOperand(1).isUndef() || MI.getNumOperands() > 3)) { MI.setDesc(TII->get(TargetOpcode::KILL)); LLVM_DEBUG(dbgs() << " replace by: " << MI); return; @@ -718,14 +724,14 @@ void VirtRegRewriter::rewrite() { SmallVector SuperDeads; SmallVector SuperDefs; SmallVector SuperKills; - uint64_t Mask; + LaneBitmask LaneMask; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { LLVM_DEBUG(MBBI->print(dbgs(), Indexes)); for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) { // reset for each MI. 
- Mask = 0u; + LaneMask = LaneBitmask::getNone(); for (MachineOperand &MO : MI.operands()) { // Make sure MRI knows about registers clobbered by regmasks. if (MO.isRegMask()) @@ -744,7 +750,7 @@ void VirtRegRewriter::rewrite() { assert(!MRI->isReserved(PhysReg) && "Reserved register assignment"); if (MO.isUse() && MI.isCopy()) - Mask = calcLiveRegUnitMask(MO, PhysReg); + LaneMask = calcLiveRegUnitMask(MO, PhysReg); // Preserve semantics of sub-register operands. unsigned SubReg = MO.getSubReg(); @@ -822,9 +828,12 @@ void VirtRegRewriter::rewrite() { MO.setIsRenamable(true); } - // Add LaneBitmask as MO_Imm - if (MI.isCopy() && Mask) - MI.addOperand(*MF, MachineOperand::CreateImm(Mask)); + // If there are any live lanes, replace a COPY instruction with a + // COPY_LANEMASK instruction with the lane mask. + if (MI.isCopy() && LaneMask.any()) { + MI.setDesc(TII->get(TargetOpcode::COPY_LANEMASK)); + MI.addOperand(*MF, MachineOperand::CreateLaneMask(LaneMask)); + } // Add any missing super-register kills after rewriting the whole // instruction. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6149b6d969717..d317cadf15be2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -820,8 +820,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); uint64_t LiveRegUnitMaskVal = 0; - if (MI->getNumOperands() > 2 && MI->getOperand(2).isImm()) { - LiveRegUnitMaskVal = MI->getOperand(2).getImm(); + if (MI->getOpcode() == TargetOpcode::COPY_LANEMASK) { + LiveRegUnitMaskVal = MI->getOperand(2).getLaneMask().getAsInteger(); } bool isSrcRegFullLive = LiveRegUnitMaskVal == 0; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a3bacfbfe5214..89641cf21c55d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -262,7 +262,9 @@ static bool isConvertibleToVMV_V_V(const RISCVSubtarget &STI, if (PreferWholeRegisterMove) return false; - assert(MBBI->getOpcode() == TargetOpcode::COPY && + // TODO : Support COPY_LANEMASK instruction. 
+ assert((MBBI->getOpcode() == TargetOpcode::COPY || + MBBI->getOpcode() == TargetOpcode::COPY_LANEMASK) && "Unexpected COPY instruction."); Register SrcReg = MBBI->getOperand(1).getReg(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index 4e9797e2686cd..a8b82d8fd16e5 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -1375,7 +1375,7 @@ body: | ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, 240, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable $sgpr0_sgpr1_sgpr2_sgpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ... --- @@ -1411,7 +1411,7 @@ body: | ; GFX942-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, 240, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
@@ -1448,7 +1448,7 @@ body: | ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, 240, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable $agpr0_agpr1_agpr2_agpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ... @@ -1485,7 +1485,7 @@ body: | ; GFX942-NEXT: $agpr6 = V_ACCVGPR_MOV_B32 killed $agpr2, implicit $exec, implicit $exec ; GFX942-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 - renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, 240, implicit $exec + renamable $agpr4_agpr5_agpr6_agpr7 = COPY_LANEMASK renamable killed $agpr0_agpr1_agpr2_agpr3, lanemask(240), implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 163d7ff9c61fc..bcf15280a7434 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -110,8 +110,9 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -222,8 +223,9 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 ; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1087,8 +1089,9 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, s2, s4 ; GFX11-NEXT: s_subb_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -1199,8 +1202,9 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 ; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; 
GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -2420,8 +2424,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: s_cmp_ge_u32 s2, s6 ; GFX9-NEXT: s_cselect_b32 s8, s4, s3 ; GFX9-NEXT: .LBB16_3: -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm @@ -2573,8 +2577,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 ; GFX1010-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1010-NEXT: .LBB16_3: -; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v1, s9 ; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm @@ -2726,8 +2730,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 ; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3 ; GFX1030W32-NEXT: .LBB16_3: -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9 ; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm @@ -2878,8 +2882,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: s_cmp_ge_u32 s2, s4 ; GFX1030W64-NEXT: s_cselect_b32 s6, s5, s3 ; GFX1030W64-NEXT: .LBB16_3: -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7 ; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm @@ -3046,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: s_cmp_ge_u32 s2, s4 ; 
GFX11-NEXT: s_cselect_b32 s8, s5, s3 ; GFX11-NEXT: .LBB16_3: -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: diff --git a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir index 644c8641c606a..c82539c48e6f9 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7 renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF - renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, 3, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 + renamable $vgpr7_vgpr8 = COPY_LANEMASK killed renamable $vgpr10_vgpr11, lanemask(3), implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 S_ENDPGM 0, implicit $vgpr7 ... diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir index 99a3daa2d05fc..3e987973920e0 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -271,7 +271,7 @@ body: | ; GFX1250: liveins: $vgpr3 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, 12, implicit $exec + $vgpr0_vgpr1 = COPY_LANEMASK killed $vgpr2_vgpr3, lanemask(12), implicit $exec ... 
--- @@ -304,7 +304,7 @@ body: | ; GFX1250: liveins: $vgpr2 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec - $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, 3, implicit $exec + $vgpr0_vgpr1 = COPY_LANEMASK killed $vgpr2_vgpr3, lanemask(3), implicit $exec ... --- diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index 965c31970404f..b0c26724797d0 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -86,9 +86,9 @@ body: | ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $sgpr64_sgpr65:0x000000000000000F ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, 4398046511103 + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024 = COPY_LANEMASK renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, lanemask(0x000003FFFFFFFFFF) ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr65, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, 
implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[COPY_LANEMASK:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY_LANEMASK]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) @@ -117,7 +117,7 @@ body: | ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr57 = COPY killed renamable $sgpr68 - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, 17592186044415, implicit $exec + ; CHECK-NEXT: dead [[COPY_LANEMASK1:%[0-9]+]]:vreg_1024 = COPY_LANEMASK renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, lanemask(0x00000FFFFFFFFFFF), implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -129,7 +129,7 @@ body: | ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr64, 1, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 = SI_SPILL_S1024_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: dead 
[[COPY2:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll index e9834de001321..3804652e2a203 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll @@ -74,7 +74,6 @@ define i32 @dead_i32(i1 %cond, i32 %x, ptr addrspace(1) %ptr1) #0 { ; ASM-GISEL-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-DAG: ; %bb.0: ; %entry entry: %dead = call i32 @llvm.amdgcn.dead.i32() br i1 %cond, label %if.then, label %if.end @@ -222,7 +221,7 @@ define %trivial_types @dead_struct(i1 %cond, %trivial_types %x, ptr addrspace(1) ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v12, v13 :: v_dual_mov_b32 v13, v14 ; ASM-GISEL-FAKE16-NEXT: v_dual_mov_b32 v14, v15 :: v_dual_mov_b32 v15, v16 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-DAG: ; %bb.0: ; %entry + ; ASM-GISEL-LABEL: dead_struct: ; ASM-GISEL: ; %bb.0: ; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -517,7 +516,7 @@ define [32 x i32] @dead_array(i1 %cond, [32 x i32] %x, ptr addrspace(1) %ptr1, i ; ASM-GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; ASM-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 ; ASM-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] -; ASM-DAG: ; %bb.0: ; %entry + ; ASM-GISEL-LABEL: dead_array: ; ASM-GISEL: ; %bb.0: 
; %entry ; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 09526ea5ac878..dd1a26eab49b1 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -50,7 +50,7 @@ body: | ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, 12884901888, implicit $exec + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024_align2 = COPY_LANEMASK renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, lanemask(0x0000000300000000), implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 ; CHECK-NEXT: SI_SPILL_S1024_SAVE renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5) @@ -61,7 +61,7 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, 
$sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY_LANEMASK]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} @@ -136,7 +136,7 @@ body: | ; CHECK-NEXT: renamable $sgpr66 = COPY killed renamable $sgpr84 ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec + ; CHECK-NEXT: [[COPY_LANEMASK:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} @@ -185,7 +185,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr66_sgpr67 = 
nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY]], undef $sgpr14, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) @@ -199,8 +199,8 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY1]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr18_sgpr19, implicit $exec ; CHECK-NEXT: dead renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec ; 
CHECK-NEXT: renamable $sgpr82 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc @@ -221,7 +221,7 @@ body: | ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83, 3 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY_LANEMASK renamable $sgpr82_sgpr83, lanemask(0x0000000000000003) ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index cf23a9d1e8a57..6f8e80277a492 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -45,7 +45,7 @@ body: | ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr55 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: $vcc = COPY renamable $sgpr34_sgpr35 + ; CHECK-NEXT: $vcc = COPY_LANEMASK renamable $sgpr34_sgpr35, lanemask(0x000000000000000C) ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 049ee47af9681..0edfb2503a08c 100644 --- 
a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -606,18 +606,22 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %ebx ; i686-NEXT: movl 76(%esp,%ebx), %ebx +; i686-NEXT: movl %eax, %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: shldl %cl, %esi, %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: movl %edi, %esi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %esi +; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %edx ; i686-NEXT: movl 108(%esp,%edx), %edx +; i686-NEXT: movl %eax, %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; i686-NEXT: shldl %cl, %eax, %edx ; i686-NEXT: movl 72(%ebp), %eax diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll index f9b73d4eaf92c..c39be0beff0c7 100644 --- a/llvm/test/CodeGen/X86/swift-return.ll +++ b/llvm/test/CodeGen/X86/swift-return.ll @@ -412,9 +412,9 @@ define swiftcc { i32, i32, i32, i32 } @gen7(i32 %key) { ; CHECK-LABEL: gen7: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl %eax, %r8d ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen7: @@ -435,9 +435,9 @@ define swiftcc { i64, i64, i64, i64 } @gen8(i64 %key) { ; CHECK-LABEL: gen8: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movq %rdi, %r8 +; 
CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen8: @@ -483,9 +483,9 @@ define swiftcc { double, double, double, double, i64, i64, i64, i64 } @gen10(dou ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: gen10: