Skip to content

Commit e14f6a3

Browse files
committed
[AMDGPU][CopyPhysReg] Expand the COPY using the encoded liveness mask.
We now use the liveness mask encoded on the COPY instruction during VirtRegRewriter to expand only the defined sub-registers, skipping the undefined ones. This lets us stop adding implicit-use and implicit-def operands to the expanded moves, avoiding unnecessary false dependencies among the registers.
1 parent 0a6cd82 commit e14f6a3

File tree

4 files changed

+836
-838
lines changed

4 files changed

+836
-838
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 82 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -694,16 +694,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
694694
I->clearRegisterKills(DefOp.getReg(), &RI);
695695
}
696696

697-
MachineInstrBuilder Builder =
698-
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699-
.add(DefOp);
700-
if (ImpDefSuperReg)
701-
Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702-
703-
if (ImpUseSuperReg) {
704-
Builder.addReg(ImpUseSuperReg,
705-
getKillRegState(KillSrc) | RegState::Implicit);
706-
}
697+
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
698+
.add(DefOp);
707699

708700
return;
709701
}
@@ -747,69 +739,74 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
747739

748740
MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749741
.addReg(SrcReg, getKillRegState(KillSrc));
750-
if (ImpUseSuperReg) {
751-
UseBuilder.addReg(ImpUseSuperReg,
752-
getKillRegState(KillSrc) | RegState::Implicit);
753-
}
754742

755-
MachineInstrBuilder DefBuilder
756-
= BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757-
.addReg(Tmp, RegState::Kill);
758-
759-
if (ImpDefSuperReg)
760-
DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743+
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
744+
.addReg(Tmp, RegState::Kill);
761745
}
762746

763747
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764748
MachineBasicBlock::iterator MI, const DebugLoc &DL,
765749
MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766-
const TargetRegisterClass *RC, bool Forward) {
750+
const TargetRegisterClass *RC, bool Forward,
751+
uint64_t LiveRegUnitMaskVal) {
767752
const SIRegisterInfo &RI = TII.getRegisterInfo();
768753
ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769754
MachineBasicBlock::iterator I = MI;
770-
MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
755+
bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
756+
757+
uint64_t TestMaskVal = 0x0000000000000003;
758+
uint8_t ShiftVal = 2;
759+
760+
if (!Forward)
761+
TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size() - 1));
771762

772763
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773764
int16_t SubIdx = BaseIndices[Idx];
774765
Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775766
Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776767
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777768
unsigned Opcode = AMDGPU::S_MOV_B32;
769+
bool IsFirstSubreg = Idx == 0;
770+
771+
if (!IsFirstSubreg) {
772+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
773+
}
774+
775+
// Check for liveness of current subregister using TestMaskVal.
776+
if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
777+
continue;
778778

779779
// Is SGPR aligned? If so try to combine with next.
780780
bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781781
bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782-
if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
782+
bool isSrc64Live = true;
783+
784+
if (!isSrcRegFullLive)
785+
isSrc64Live = Forward
786+
? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) !=
787+
uint64_t(0))
788+
: ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) !=
789+
uint64_t(0));
790+
791+
if (isSrc64Live && AlignedDest && AlignedSrc &&
792+
(Idx + 1 < BaseIndices.size())) {
783793
// Can use SGPR64 copy
784794
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785795
SubIdx = RI.getSubRegFromChannel(Channel, 2);
786796
DestSubReg = RI.getSubReg(DestReg, SubIdx);
787797
SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788798
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
799+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
789800
Opcode = AMDGPU::S_MOV_B64;
790801
Idx++;
791802
}
792803

793-
LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794-
.addReg(SrcSubReg)
795-
.addReg(SrcReg, RegState::Implicit);
796-
797-
if (!FirstMI)
798-
FirstMI = LastMI;
804+
BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
805+
.addReg(SrcSubReg, getKillRegState(KillSrc));
799806

800807
if (!Forward)
801808
I--;
802809
}
803-
804-
assert(FirstMI && LastMI);
805-
if (!Forward)
806-
std::swap(FirstMI, LastMI);
807-
808-
FirstMI->addOperand(
809-
MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810-
811-
if (KillSrc)
812-
LastMI->addRegisterKilled(SrcReg, &RI);
813810
}
814811

815812
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -822,6 +819,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
822819
const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823820
unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824821

822+
uint64_t LiveRegUnitMaskVal = 0;
823+
if (MI->getNumOperands() > 2 && MI->getOperand(2).isImm()) {
824+
LiveRegUnitMaskVal = MI->getOperand(2).getImm();
825+
}
826+
827+
bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
828+
825829
// The rest of copyPhysReg assumes Src and Dst size are the same size.
826830
// TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827831
// we remove Fix16BitCopies and this code block?
@@ -1043,16 +1047,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10431047
}
10441048
if (ST.hasPkMovB32()) {
10451049
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046-
.addImm(SISrcMods::OP_SEL_1)
1047-
.addReg(SrcReg)
1048-
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1049-
.addReg(SrcReg)
1050-
.addImm(0) // op_sel_lo
1051-
.addImm(0) // op_sel_hi
1052-
.addImm(0) // neg_lo
1053-
.addImm(0) // neg_hi
1054-
.addImm(0) // clamp
1055-
.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1050+
.addImm(SISrcMods::OP_SEL_1)
1051+
.addReg(SrcReg, getKillRegState(KillSrc))
1052+
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1053+
.addReg(SrcReg, getKillRegState(KillSrc))
1054+
.addImm(0) // op_sel_lo
1055+
.addImm(0) // op_sel_hi
1056+
.addImm(0) // neg_lo
1057+
.addImm(0) // neg_hi
1058+
.addImm(0); // clamp
10561059
return;
10571060
}
10581061
}
@@ -1065,12 +1068,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10651068
}
10661069
const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
10671070
expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068-
Forward);
1071+
Forward, LiveRegUnitMaskVal);
10691072
return;
10701073
}
10711074

10721075
unsigned EltSize = 4;
10731076
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1077+
uint64_t TestMaskVal = 0x0000000000000003;
1078+
uint8_t ShiftVal = 2;
10741079
if (RI.isAGPRClass(RC)) {
10751080
if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
10761081
Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
@@ -1085,12 +1090,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10851090
(RI.isProperlyAlignedRC(*RC) &&
10861091
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
10871092
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1093+
// TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit
1094+
// moves. See the expandSGPRCopy function for reference.
10881095
if (ST.hasMovB64()) {
10891096
Opcode = AMDGPU::V_MOV_B64_e32;
10901097
EltSize = 8;
1098+
TestMaskVal = 0x000000000000000F;
1099+
ShiftVal = 4;
10911100
} else if (ST.hasPkMovB32()) {
10921101
Opcode = AMDGPU::V_PK_MOV_B32;
10931102
EltSize = 8;
1103+
TestMaskVal = 0x000000000000000F;
1104+
ShiftVal = 4;
10941105
}
10951106
}
10961107

@@ -1105,6 +1116,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11051116

11061117
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
11071118

1119+
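// For a backward copy, TestMaskVal starts at the highest sub-register's bits and is shifted right each iteration (scanning from high bits to low bits).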
1120+
if (!Forward)
1121+
TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size() - 1));
1122+
11081123
// If there is an overlap, we can't kill the super-register on the last
11091124
// instruction, since it will also kill the components made live by this def.
11101125
const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
@@ -1121,7 +1136,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11211136
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
11221137

11231138
bool IsFirstSubreg = Idx == 0;
1124-
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1139+
bool UseKill = CanKillSuperReg;
1140+
1141+
if (!IsFirstSubreg) {
1142+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
1143+
}
1144+
1145+
if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
1146+
continue;
11251147

11261148
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
11271149
Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
@@ -1132,24 +1154,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11321154
MachineInstrBuilder MIB =
11331155
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
11341156
.addImm(SISrcMods::OP_SEL_1)
1135-
.addReg(SrcSubReg)
1157+
.addReg(SrcSubReg, getKillRegState(UseKill))
11361158
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1137-
.addReg(SrcSubReg)
1138-
.addImm(0) // op_sel_lo
1139-
.addImm(0) // op_sel_hi
1140-
.addImm(0) // neg_lo
1141-
.addImm(0) // neg_hi
1142-
.addImm(0) // clamp
1143-
.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144-
if (IsFirstSubreg)
1145-
MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1159+
.addReg(SrcSubReg, getKillRegState(UseKill))
1160+
.addImm(0) // op_sel_lo
1161+
.addImm(0) // op_sel_hi
1162+
.addImm(0) // neg_lo
1163+
.addImm(0) // neg_hi
1164+
.addImm(0); // clamp
11461165
} else {
11471166
MachineInstrBuilder Builder =
1148-
BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149-
if (IsFirstSubreg)
1150-
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151-
1152-
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1167+
BuildMI(MBB, MI, DL, get(Opcode), DestSubReg)
1168+
.addReg(SrcSubReg, getKillRegState(UseKill));
11531169
}
11541170
}
11551171
}

0 commit comments

Comments
 (0)