@@ -694,16 +694,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
694694 I->clearRegisterKills (DefOp.getReg (), &RI);
695695 }
696696
697- MachineInstrBuilder Builder =
698- BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699- .add (DefOp);
700- if (ImpDefSuperReg)
701- Builder.addReg (ImpDefSuperReg, RegState::Define | RegState::Implicit);
702-
703- if (ImpUseSuperReg) {
704- Builder.addReg (ImpUseSuperReg,
705- getKillRegState (KillSrc) | RegState::Implicit);
706- }
697+ BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698+ .add (DefOp);
707699
708700 return ;
709701 }
@@ -747,69 +739,74 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
747739
748740 MachineInstrBuilder UseBuilder = BuildMI (MBB, MI, DL, TII.get (TmpCopyOp), Tmp)
749741 .addReg (SrcReg, getKillRegState (KillSrc));
750- if (ImpUseSuperReg) {
751- UseBuilder.addReg (ImpUseSuperReg,
752- getKillRegState (KillSrc) | RegState::Implicit);
753- }
754742
755- MachineInstrBuilder DefBuilder
756- = BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757- .addReg (Tmp, RegState::Kill);
758-
759- if (ImpDefSuperReg)
760- DefBuilder.addReg (ImpDefSuperReg, RegState::Define | RegState::Implicit);
743+ BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
744+ .addReg (Tmp, RegState::Kill);
761745}
762746
763747static void expandSGPRCopy (const SIInstrInfo &TII, MachineBasicBlock &MBB,
764748 MachineBasicBlock::iterator MI, const DebugLoc &DL,
765749 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766- const TargetRegisterClass *RC, bool Forward) {
750+ const TargetRegisterClass *RC, bool Forward,
751+ uint64_t LiveRegUnitMaskVal) {
767752 const SIRegisterInfo &RI = TII.getRegisterInfo ();
768753 ArrayRef<int16_t > BaseIndices = RI.getRegSplitParts (RC, 4 );
769754 MachineBasicBlock::iterator I = MI;
770- MachineInstr *FirstMI = nullptr , *LastMI = nullptr ;
755+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0 ;
756+
757+ uint64_t TestMaskVal = 0x0000000000000003 ;
758+ uint8_t ShiftVal = 2 ;
759+
760+ if (!Forward)
761+ TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size () - 1 ));
771762
772763 for (unsigned Idx = 0 ; Idx < BaseIndices.size (); ++Idx) {
773764 int16_t SubIdx = BaseIndices[Idx];
774765 Register DestSubReg = RI.getSubReg (DestReg, SubIdx);
775766 Register SrcSubReg = RI.getSubReg (SrcReg, SubIdx);
776767 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
777768 unsigned Opcode = AMDGPU::S_MOV_B32;
769+ bool IsFirstSubreg = Idx == 0 ;
770+
771+ if (!IsFirstSubreg) {
772+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
773+ }
774+
775+ // Check for liveness of current subregister using TestMaskVal.
776+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t (0 ))
777+ continue ;
778778
779779 // Is SGPR aligned? If so try to combine with next.
780780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2 ) == 0 ;
781781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2 ) == 0 ;
782- if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size ())) {
782+ bool isSrc64Live = true ;
783+
784+ if (!isSrcRegFullLive)
785+ isSrc64Live = Forward
786+ ? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) !=
787+ uint64_t (0 ))
788+ : ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) !=
789+ uint64_t (0 ));
790+
791+ if (isSrc64Live && AlignedDest && AlignedSrc &&
792+ (Idx + 1 < BaseIndices.size ())) {
783793 // Can use SGPR64 copy
784794 unsigned Channel = RI.getChannelFromSubReg (SubIdx);
785795 SubIdx = RI.getSubRegFromChannel (Channel, 2 );
786796 DestSubReg = RI.getSubReg (DestReg, SubIdx);
787797 SrcSubReg = RI.getSubReg (SrcReg, SubIdx);
788798 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
799+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
789800 Opcode = AMDGPU::S_MOV_B64;
790801 Idx++;
791802 }
792803
793- LastMI = BuildMI (MBB, I, DL, TII.get (Opcode), DestSubReg)
794- .addReg (SrcSubReg)
795- .addReg (SrcReg, RegState::Implicit);
796-
797- if (!FirstMI)
798- FirstMI = LastMI;
804+ BuildMI (MBB, I, DL, TII.get (Opcode), DestSubReg)
805+ .addReg (SrcSubReg, getKillRegState (KillSrc));
799806
800807 if (!Forward)
801808 I--;
802809 }
803-
804- assert (FirstMI && LastMI);
805- if (!Forward)
806- std::swap (FirstMI, LastMI);
807-
808- FirstMI->addOperand (
809- MachineOperand::CreateReg (DestReg, true /* IsDef*/ , true /* IsImp*/ ));
810-
811- if (KillSrc)
812- LastMI->addRegisterKilled (SrcReg, &RI);
813810}
814811
815812void SIInstrInfo::copyPhysReg (MachineBasicBlock &MBB,
@@ -822,6 +819,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
822819 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass (SrcReg);
823820 unsigned SrcSize = RI.getRegSizeInBits (*SrcRC);
824821
822+ uint64_t LiveRegUnitMaskVal = 0 ;
823+ if (MI->getNumOperands () > 2 && MI->getOperand (2 ).isImm ()) {
824+ LiveRegUnitMaskVal = MI->getOperand (2 ).getImm ();
825+ }
826+
827+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0 ;
828+
825829 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826830 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827831 // we remove Fix16BitCopies and this code block?
@@ -1043,16 +1047,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10431047 }
10441048 if (ST.hasPkMovB32 ()) {
10451049 BuildMI (MBB, MI, DL, get (AMDGPU::V_PK_MOV_B32), DestReg)
1046- .addImm (SISrcMods::OP_SEL_1)
1047- .addReg (SrcReg)
1048- .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1049- .addReg (SrcReg)
1050- .addImm (0 ) // op_sel_lo
1051- .addImm (0 ) // op_sel_hi
1052- .addImm (0 ) // neg_lo
1053- .addImm (0 ) // neg_hi
1054- .addImm (0 ) // clamp
1055- .addReg (SrcReg, getKillRegState (KillSrc) | RegState::Implicit);
1050+ .addImm (SISrcMods::OP_SEL_1)
1051+ .addReg (SrcReg, getKillRegState (KillSrc))
1052+ .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1053+ .addReg (SrcReg, getKillRegState (KillSrc))
1054+ .addImm (0 ) // op_sel_lo
1055+ .addImm (0 ) // op_sel_hi
1056+ .addImm (0 ) // neg_lo
1057+ .addImm (0 ) // neg_hi
1058+ .addImm (0 ); // clamp
10561059 return ;
10571060 }
10581061 }
@@ -1065,12 +1068,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10651068 }
10661069 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap (SrcReg, DestReg);
10671070 expandSGPRCopy (*this , MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068- Forward);
1071+ Forward, LiveRegUnitMaskVal );
10691072 return ;
10701073 }
10711074
10721075 unsigned EltSize = 4 ;
10731076 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1077+ uint64_t TestMaskVal = 0x0000000000000003 ;
1078+ uint8_t ShiftVal = 2 ;
10741079 if (RI.isAGPRClass (RC)) {
10751080 if (ST.hasGFX90AInsts () && RI.isAGPRClass (SrcRC))
10761081 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
@@ -1085,12 +1090,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10851090 (RI.isProperlyAlignedRC (*RC) &&
10861091 (SrcRC == RC || RI.isSGPRClass (SrcRC)))) {
10871092 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1093+ // TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit
1094+ // moves. See the expandSGPRCopy function for reference.
10881095 if (ST.hasMovB64 ()) {
10891096 Opcode = AMDGPU::V_MOV_B64_e32;
10901097 EltSize = 8 ;
1098+ TestMaskVal = 0x000000000000000F ;
1099+ ShiftVal = 4 ;
10911100 } else if (ST.hasPkMovB32 ()) {
10921101 Opcode = AMDGPU::V_PK_MOV_B32;
10931102 EltSize = 8 ;
1103+ TestMaskVal = 0x000000000000000F ;
1104+ ShiftVal = 4 ;
10941105 }
10951106 }
10961107
@@ -1105,6 +1116,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11051116
11061117 ArrayRef<int16_t > SubIndices = RI.getRegSplitParts (RC, EltSize);
11071118
1119+ // For a backward copy, start TestMaskVal at the highest sub-register's bits; it is shifted right on each later iteration.
1120+ if (!Forward)
1121+ TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size () - 1 ));
1122+
11081123 // If there is an overlap, we can't kill the super-register on the last
11091124 // instruction, since it will also kill the components made live by this def.
11101125 const bool Overlap = RI.regsOverlap (SrcReg, DestReg);
@@ -1121,7 +1136,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11211136 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
11221137
11231138 bool IsFirstSubreg = Idx == 0 ;
1124- bool UseKill = CanKillSuperReg && Idx == SubIndices.size () - 1 ;
1139+ bool UseKill = CanKillSuperReg;
1140+
1141+ if (!IsFirstSubreg) {
1142+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
1143+ }
1144+
1145+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t (0 ))
1146+ continue ;
11251147
11261148 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
11271149 Register ImpDefSuper = IsFirstSubreg ? Register (DestReg) : Register ();
@@ -1132,24 +1154,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11321154 MachineInstrBuilder MIB =
11331155 BuildMI (MBB, MI, DL, get (AMDGPU::V_PK_MOV_B32), DestSubReg)
11341156 .addImm (SISrcMods::OP_SEL_1)
1135- .addReg (SrcSubReg)
1157+ .addReg (SrcSubReg, getKillRegState (UseKill) )
11361158 .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1137- .addReg (SrcSubReg)
1138- .addImm (0 ) // op_sel_lo
1139- .addImm (0 ) // op_sel_hi
1140- .addImm (0 ) // neg_lo
1141- .addImm (0 ) // neg_hi
1142- .addImm (0 ) // clamp
1143- .addReg (SrcReg, getKillRegState (UseKill) | RegState::Implicit);
1144- if (IsFirstSubreg)
1145- MIB.addReg (DestReg, RegState::Define | RegState::Implicit);
1159+ .addReg (SrcSubReg, getKillRegState (UseKill))
1160+ .addImm (0 ) // op_sel_lo
1161+ .addImm (0 ) // op_sel_hi
1162+ .addImm (0 ) // neg_lo
1163+ .addImm (0 ) // neg_hi
1164+ .addImm (0 ); // clamp
11461165 } else {
11471166 MachineInstrBuilder Builder =
1148- BuildMI (MBB, MI, DL, get (Opcode), DestSubReg).addReg (SrcSubReg);
1149- if (IsFirstSubreg)
1150- Builder.addReg (DestReg, RegState::Define | RegState::Implicit);
1151-
1152- Builder.addReg (SrcReg, getKillRegState (UseKill) | RegState::Implicit);
1167+ BuildMI (MBB, MI, DL, get (Opcode), DestSubReg)
1168+ .addReg (SrcSubReg, getKillRegState (UseKill));
11531169 }
11541170 }
11551171}
0 commit comments