@@ -691,16 +691,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
691691 I->clearRegisterKills (DefOp.getReg (), &RI);
692692 }
693693
694- MachineInstrBuilder Builder =
695- BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
696- .add (DefOp);
697- if (ImpDefSuperReg)
698- Builder.addReg (ImpDefSuperReg, RegState::Define | RegState::Implicit);
699-
700- if (ImpUseSuperReg) {
701- Builder.addReg (ImpUseSuperReg,
702- getKillRegState (KillSrc) | RegState::Implicit);
703- }
694+ BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
695+ .add (DefOp);
704696
705697 return ;
706698 }
@@ -744,69 +736,74 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
744736
745737 MachineInstrBuilder UseBuilder = BuildMI (MBB, MI, DL, TII.get (TmpCopyOp), Tmp)
746738 .addReg (SrcReg, getKillRegState (KillSrc));
747- if (ImpUseSuperReg) {
748- UseBuilder.addReg (ImpUseSuperReg,
749- getKillRegState (KillSrc) | RegState::Implicit);
750- }
751739
752- MachineInstrBuilder DefBuilder
753- = BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
754- .addReg (Tmp, RegState::Kill);
755-
756- if (ImpDefSuperReg)
757- DefBuilder.addReg (ImpDefSuperReg, RegState::Define | RegState::Implicit);
740+ BuildMI (MBB, MI, DL, TII.get (AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
741+ .addReg (Tmp, RegState::Kill);
758742}
759743
760744static void expandSGPRCopy (const SIInstrInfo &TII, MachineBasicBlock &MBB,
761745 MachineBasicBlock::iterator MI, const DebugLoc &DL,
762746 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
763- const TargetRegisterClass *RC, bool Forward) {
747+ const TargetRegisterClass *RC, bool Forward,
748+ uint64_t LiveRegUnitMaskVal) {
764749 const SIRegisterInfo &RI = TII.getRegisterInfo ();
765750 ArrayRef<int16_t > BaseIndices = RI.getRegSplitParts (RC, 4 );
766751 MachineBasicBlock::iterator I = MI;
767- MachineInstr *FirstMI = nullptr , *LastMI = nullptr ;
752+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0 ;
753+
754+ uint64_t TestMaskVal = 0x0000000000000003 ;
755+ uint8_t ShiftVal = 2 ;
756+
757+ if (!Forward)
758+ TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size () - 1 ));
768759
769760 for (unsigned Idx = 0 ; Idx < BaseIndices.size (); ++Idx) {
770761 int16_t SubIdx = BaseIndices[Idx];
771762 Register DestSubReg = RI.getSubReg (DestReg, SubIdx);
772763 Register SrcSubReg = RI.getSubReg (SrcReg, SubIdx);
773764 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
774765 unsigned Opcode = AMDGPU::S_MOV_B32;
766+ bool IsFirstSubreg = Idx == 0 ;
767+
768+ if (!IsFirstSubreg) {
769+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
770+ }
771+
772+ // Check for liveness of current subregister using TestMaskVal.
773+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t (0 ))
774+ continue ;
775775
776776 // Is SGPR aligned? If so try to combine with next.
777777 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2 ) == 0 ;
778778 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2 ) == 0 ;
779- if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size ())) {
779+ bool isSrc64Live = true ;
780+
781+ if (!isSrcRegFullLive)
782+ isSrc64Live = Forward
783+ ? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) !=
784+ uint64_t (0 ))
785+ : ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) !=
786+ uint64_t (0 ));
787+
788+ if (isSrc64Live && AlignedDest && AlignedSrc &&
789+ (Idx + 1 < BaseIndices.size ())) {
780790 // Can use SGPR64 copy
781791 unsigned Channel = RI.getChannelFromSubReg (SubIdx);
782792 SubIdx = RI.getSubRegFromChannel (Channel, 2 );
783793 DestSubReg = RI.getSubReg (DestReg, SubIdx);
784794 SrcSubReg = RI.getSubReg (SrcReg, SubIdx);
785795 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
796+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
786797 Opcode = AMDGPU::S_MOV_B64;
787798 Idx++;
788799 }
789800
790- LastMI = BuildMI (MBB, I, DL, TII.get (Opcode), DestSubReg)
791- .addReg (SrcSubReg)
792- .addReg (SrcReg, RegState::Implicit);
793-
794- if (!FirstMI)
795- FirstMI = LastMI;
801+ BuildMI (MBB, I, DL, TII.get (Opcode), DestSubReg)
802+ .addReg (SrcSubReg, getKillRegState (KillSrc));
796803
797804 if (!Forward)
798805 I--;
799806 }
800-
801- assert (FirstMI && LastMI);
802- if (!Forward)
803- std::swap (FirstMI, LastMI);
804-
805- FirstMI->addOperand (
806- MachineOperand::CreateReg (DestReg, true /* IsDef*/ , true /* IsImp*/ ));
807-
808- if (KillSrc)
809- LastMI->addRegisterKilled (SrcReg, &RI);
810807}
811808
812809void SIInstrInfo::copyPhysReg (MachineBasicBlock &MBB,
@@ -819,6 +816,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
819816 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass (SrcReg);
820817 unsigned SrcSize = RI.getRegSizeInBits (*SrcRC);
821818
819+ uint64_t LiveRegUnitMaskVal = 0 ;
820+ if (MI->getNumOperands () > 2 && MI->getOperand (2 ).isImm ()) {
821+ LiveRegUnitMaskVal = MI->getOperand (2 ).getImm ();
822+ }
823+
824+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0 ;
825+
822826 // The rest of copyPhysReg assumes Src and Dst size are the same size.
823827 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
824828 // we remove Fix16BitCopies and this code block?
@@ -1052,16 +1056,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10521056 }
10531057 if (ST.hasPkMovB32 ()) {
10541058 BuildMI (MBB, MI, DL, get (AMDGPU::V_PK_MOV_B32), DestReg)
1055- .addImm (SISrcMods::OP_SEL_1)
1056- .addReg (SrcReg)
1057- .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1058- .addReg (SrcReg)
1059- .addImm (0 ) // op_sel_lo
1060- .addImm (0 ) // op_sel_hi
1061- .addImm (0 ) // neg_lo
1062- .addImm (0 ) // neg_hi
1063- .addImm (0 ) // clamp
1064- .addReg (SrcReg, getKillRegState (KillSrc) | RegState::Implicit);
1059+ .addImm (SISrcMods::OP_SEL_1)
1060+ .addReg (SrcReg, getKillRegState (KillSrc))
1061+ .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1062+ .addReg (SrcReg, getKillRegState (KillSrc))
1063+ .addImm (0 ) // op_sel_lo
1064+ .addImm (0 ) // op_sel_hi
1065+ .addImm (0 ) // neg_lo
1066+ .addImm (0 ) // neg_hi
1067+ .addImm (0 ); // clamp
10651068 return ;
10661069 }
10671070 }
@@ -1074,12 +1077,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10741077 }
10751078 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap (SrcReg, DestReg);
10761079 expandSGPRCopy (*this , MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1077- Forward);
1080+ Forward, LiveRegUnitMaskVal );
10781081 return ;
10791082 }
10801083
10811084 unsigned EltSize = 4 ;
10821085 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1086+ uint64_t TestMaskVal = 0x0000000000000003 ;
1087+ uint8_t ShiftVal = 2 ;
10831088 if (RI.isAGPRClass (RC)) {
10841089 if (ST.hasGFX90AInsts () && RI.isAGPRClass (SrcRC))
10851090 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
@@ -1094,12 +1099,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10941099 (RI.isProperlyAlignedRC (*RC) &&
10951100 (SrcRC == RC || RI.isSGPRClass (SrcRC)))) {
10961101 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1102+ // TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit
1103+ // moves. See the expandSGPRCopy function for reference.
10971104 if (ST.hasMovB64 ()) {
10981105 Opcode = AMDGPU::V_MOV_B64_e32;
10991106 EltSize = 8 ;
1107+ TestMaskVal = 0x000000000000000F ;
1108+ ShiftVal = 4 ;
11001109 } else if (ST.hasPkMovB32 ()) {
11011110 Opcode = AMDGPU::V_PK_MOV_B32;
11021111 EltSize = 8 ;
1112+ TestMaskVal = 0x000000000000000F ;
1113+ ShiftVal = 4 ;
11031114 }
11041115 }
11051116
@@ -1114,6 +1125,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11141125
11151126 ArrayRef<int16_t > SubIndices = RI.getRegSplitParts (RC, EltSize);
11161127
1128+ // For a backward copy, start TestMaskVal at the highest element; it is then shifted right (toward the lowest) on each iteration.
1129+ if (!Forward)
1130+ TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size () - 1 ));
1131+
11171132 // If there is an overlap, we can't kill the super-register on the last
11181133 // instruction, since it will also kill the components made live by this def.
11191134 const bool Overlap = RI.regsOverlap (SrcReg, DestReg);
@@ -1130,7 +1145,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11301145 assert (DestSubReg && SrcSubReg && " Failed to find subregs!" );
11311146
11321147 bool IsFirstSubreg = Idx == 0 ;
1133- bool UseKill = CanKillSuperReg && Idx == SubIndices.size () - 1 ;
1148+ bool UseKill = CanKillSuperReg;
1149+
1150+ if (!IsFirstSubreg) {
1151+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
1152+ }
1153+
1154+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t (0 ))
1155+ continue ;
11341156
11351157 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
11361158 Register ImpDefSuper = IsFirstSubreg ? Register (DestReg) : Register ();
@@ -1141,24 +1163,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11411163 MachineInstrBuilder MIB =
11421164 BuildMI (MBB, MI, DL, get (AMDGPU::V_PK_MOV_B32), DestSubReg)
11431165 .addImm (SISrcMods::OP_SEL_1)
1144- .addReg (SrcSubReg)
1166+ .addReg (SrcSubReg, getKillRegState (UseKill) )
11451167 .addImm (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1146- .addReg (SrcSubReg)
1147- .addImm (0 ) // op_sel_lo
1148- .addImm (0 ) // op_sel_hi
1149- .addImm (0 ) // neg_lo
1150- .addImm (0 ) // neg_hi
1151- .addImm (0 ) // clamp
1152- .addReg (SrcReg, getKillRegState (UseKill) | RegState::Implicit);
1153- if (IsFirstSubreg)
1154- MIB.addReg (DestReg, RegState::Define | RegState::Implicit);
1168+ .addReg (SrcSubReg, getKillRegState (UseKill))
1169+ .addImm (0 ) // op_sel_lo
1170+ .addImm (0 ) // op_sel_hi
1171+ .addImm (0 ) // neg_lo
1172+ .addImm (0 ) // neg_hi
1173+ .addImm (0 ); // clamp
11551174 } else {
11561175 MachineInstrBuilder Builder =
1157- BuildMI (MBB, MI, DL, get (Opcode), DestSubReg).addReg (SrcSubReg);
1158- if (IsFirstSubreg)
1159- Builder.addReg (DestReg, RegState::Define | RegState::Implicit);
1160-
1161- Builder.addReg (SrcReg, getKillRegState (UseKill) | RegState::Implicit);
1176+ BuildMI (MBB, MI, DL, get (Opcode), DestSubReg)
1177+ .addReg (SrcSubReg, getKillRegState (UseKill));
11621178 }
11631179 }
11641180}
0 commit comments