Skip to content

Commit 7d6e7fb

Browse files
committed
[AMDGPU][CopyPhysReg] Expand the COPY using the encoded liveness mask.
We now use the liveness mask encoded during VirtRegRewriter for the COPY instruction to expand only the defined sub-registers, skipping the undefined ones. This lets us stop adding implicit and implicit-def operands, which avoids unnecessary false dependencies among the registers.
1 parent a1e269d commit 7d6e7fb

File tree

4 files changed

+836
-838
lines changed

4 files changed

+836
-838
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 82 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -691,16 +691,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
691691
I->clearRegisterKills(DefOp.getReg(), &RI);
692692
}
693693

694-
MachineInstrBuilder Builder =
695-
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
696-
.add(DefOp);
697-
if (ImpDefSuperReg)
698-
Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
699-
700-
if (ImpUseSuperReg) {
701-
Builder.addReg(ImpUseSuperReg,
702-
getKillRegState(KillSrc) | RegState::Implicit);
703-
}
694+
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
695+
.add(DefOp);
704696

705697
return;
706698
}
@@ -744,69 +736,74 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
744736

745737
MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
746738
.addReg(SrcReg, getKillRegState(KillSrc));
747-
if (ImpUseSuperReg) {
748-
UseBuilder.addReg(ImpUseSuperReg,
749-
getKillRegState(KillSrc) | RegState::Implicit);
750-
}
751739

752-
MachineInstrBuilder DefBuilder
753-
= BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
754-
.addReg(Tmp, RegState::Kill);
755-
756-
if (ImpDefSuperReg)
757-
DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
740+
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
741+
.addReg(Tmp, RegState::Kill);
758742
}
759743

760744
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
761745
MachineBasicBlock::iterator MI, const DebugLoc &DL,
762746
MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
763-
const TargetRegisterClass *RC, bool Forward) {
747+
const TargetRegisterClass *RC, bool Forward,
748+
uint64_t LiveRegUnitMaskVal) {
764749
const SIRegisterInfo &RI = TII.getRegisterInfo();
765750
ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
766751
MachineBasicBlock::iterator I = MI;
767-
MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
752+
bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
753+
754+
uint64_t TestMaskVal = 0x0000000000000003;
755+
uint8_t ShiftVal = 2;
756+
757+
if (!Forward)
758+
TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size() - 1));
768759

769760
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
770761
int16_t SubIdx = BaseIndices[Idx];
771762
Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
772763
Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
773764
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
774765
unsigned Opcode = AMDGPU::S_MOV_B32;
766+
bool IsFirstSubreg = Idx == 0;
767+
768+
if (!IsFirstSubreg) {
769+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
770+
}
771+
772+
// Check for liveness of current subregister using TestMaskVal.
773+
if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
774+
continue;
775775

776776
// Is SGPR aligned? If so try to combine with next.
777777
bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
778778
bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
779-
if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
779+
bool isSrc64Live = true;
780+
781+
if (!isSrcRegFullLive)
782+
isSrc64Live = Forward
783+
? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) !=
784+
uint64_t(0))
785+
: ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) !=
786+
uint64_t(0));
787+
788+
if (isSrc64Live && AlignedDest && AlignedSrc &&
789+
(Idx + 1 < BaseIndices.size())) {
780790
// Can use SGPR64 copy
781791
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
782792
SubIdx = RI.getSubRegFromChannel(Channel, 2);
783793
DestSubReg = RI.getSubReg(DestReg, SubIdx);
784794
SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
785795
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
796+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
786797
Opcode = AMDGPU::S_MOV_B64;
787798
Idx++;
788799
}
789800

790-
LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
791-
.addReg(SrcSubReg)
792-
.addReg(SrcReg, RegState::Implicit);
793-
794-
if (!FirstMI)
795-
FirstMI = LastMI;
801+
BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
802+
.addReg(SrcSubReg, getKillRegState(KillSrc));
796803

797804
if (!Forward)
798805
I--;
799806
}
800-
801-
assert(FirstMI && LastMI);
802-
if (!Forward)
803-
std::swap(FirstMI, LastMI);
804-
805-
FirstMI->addOperand(
806-
MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
807-
808-
if (KillSrc)
809-
LastMI->addRegisterKilled(SrcReg, &RI);
810807
}
811808

812809
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -819,6 +816,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
819816
const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
820817
unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
821818

819+
uint64_t LiveRegUnitMaskVal = 0;
820+
if (MI->getNumOperands() > 2 && MI->getOperand(2).isImm()) {
821+
LiveRegUnitMaskVal = MI->getOperand(2).getImm();
822+
}
823+
824+
bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
825+
822826
// The rest of copyPhysReg assumes Src and Dst size are the same size.
823827
// TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
824828
// we remove Fix16BitCopies and this code block?
@@ -1052,16 +1056,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10521056
}
10531057
if (ST.hasPkMovB32()) {
10541058
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1055-
.addImm(SISrcMods::OP_SEL_1)
1056-
.addReg(SrcReg)
1057-
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1058-
.addReg(SrcReg)
1059-
.addImm(0) // op_sel_lo
1060-
.addImm(0) // op_sel_hi
1061-
.addImm(0) // neg_lo
1062-
.addImm(0) // neg_hi
1063-
.addImm(0) // clamp
1064-
.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1059+
.addImm(SISrcMods::OP_SEL_1)
1060+
.addReg(SrcReg, getKillRegState(KillSrc))
1061+
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1062+
.addReg(SrcReg, getKillRegState(KillSrc))
1063+
.addImm(0) // op_sel_lo
1064+
.addImm(0) // op_sel_hi
1065+
.addImm(0) // neg_lo
1066+
.addImm(0) // neg_hi
1067+
.addImm(0); // clamp
10651068
return;
10661069
}
10671070
}
@@ -1074,12 +1077,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10741077
}
10751078
const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
10761079
expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1077-
Forward);
1080+
Forward, LiveRegUnitMaskVal);
10781081
return;
10791082
}
10801083

10811084
unsigned EltSize = 4;
10821085
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1086+
uint64_t TestMaskVal = 0x0000000000000003;
1087+
uint8_t ShiftVal = 2;
10831088
if (RI.isAGPRClass(RC)) {
10841089
if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
10851090
Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
@@ -1094,12 +1099,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
10941099
(RI.isProperlyAlignedRC(*RC) &&
10951100
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
10961101
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1102+
// TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit
1103+
moves. See the expandSGPRCopy function for reference.
10971104
if (ST.hasMovB64()) {
10981105
Opcode = AMDGPU::V_MOV_B64_e32;
10991106
EltSize = 8;
1107+
TestMaskVal = 0x000000000000000F;
1108+
ShiftVal = 4;
11001109
} else if (ST.hasPkMovB32()) {
11011110
Opcode = AMDGPU::V_PK_MOV_B32;
11021111
EltSize = 8;
1112+
TestMaskVal = 0x000000000000000F;
1113+
ShiftVal = 4;
11031114
}
11041115
}
11051116

@@ -1114,6 +1125,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11141125

11151126
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
11161127

1128+
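// TestMaskVal scans from the low bits upward for a forward copy; for a backward copy, start it at the bits of the last sub-register so it scans downward.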
1129+
if (!Forward)
1130+
TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size() - 1));
1131+
11171132
// If there is an overlap, we can't kill the super-register on the last
11181133
// instruction, since it will also kill the components made live by this def.
11191134
const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
@@ -1130,7 +1145,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11301145
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
11311146

11321147
bool IsFirstSubreg = Idx == 0;
1133-
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1148+
bool UseKill = CanKillSuperReg;
1149+
1150+
if (!IsFirstSubreg) {
1151+
TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
1152+
}
1153+
1154+
if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
1155+
continue;
11341156

11351157
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
11361158
Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
@@ -1141,24 +1163,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
11411163
MachineInstrBuilder MIB =
11421164
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
11431165
.addImm(SISrcMods::OP_SEL_1)
1144-
.addReg(SrcSubReg)
1166+
.addReg(SrcSubReg, getKillRegState(UseKill))
11451167
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1146-
.addReg(SrcSubReg)
1147-
.addImm(0) // op_sel_lo
1148-
.addImm(0) // op_sel_hi
1149-
.addImm(0) // neg_lo
1150-
.addImm(0) // neg_hi
1151-
.addImm(0) // clamp
1152-
.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153-
if (IsFirstSubreg)
1154-
MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1168+
.addReg(SrcSubReg, getKillRegState(UseKill))
1169+
.addImm(0) // op_sel_lo
1170+
.addImm(0) // op_sel_hi
1171+
.addImm(0) // neg_lo
1172+
.addImm(0) // neg_hi
1173+
.addImm(0); // clamp
11551174
} else {
11561175
MachineInstrBuilder Builder =
1157-
BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1158-
if (IsFirstSubreg)
1159-
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1160-
1161-
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1176+
BuildMI(MBB, MI, DL, get(Opcode), DestSubReg)
1177+
.addReg(SrcSubReg, getKillRegState(UseKill));
11621178
}
11631179
}
11641180
}

0 commit comments

Comments
 (0)