Skip to content

Commit 854a465

Browse files
committed
add subreg case
1 parent 1807334 commit 854a465

File tree

5 files changed

+113
-125
lines changed

5 files changed

+113
-125
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
75277527
return;
75287528

75297529
unsigned Opcode = MI.getOpcode();
7530+
if (Opcode == AMDGPU::REG_SEQUENCE) {
7531+
legalizeSpecialInst_t16(MI, MRI);
7532+
return;
7533+
}
7534+
75307535
MachineBasicBlock *MBB = MI.getParent();
75317536
// Legalize operands and check for size mismatch
75327537
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7565,50 +7570,63 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
75657570
legalizeOperandsVALUt16(MI, OpIdx, MRI);
75667571
}
75677572

7568-
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7569-
// lowering (lower the copy itself). Including cases:
7570-
// 1. sreg32 = copy vgpr16 => vgpr32 = REG_SEQUENCE(vgpr16, lo16)
7571-
// 2. sreg32 = copy .lo16:vgpr32 / sreg32 = copy .hi16:vgpr32
7572-
// => vgpr16 = copy .hi/lo16:vgpr32
7573-
// vgpr32 = REG_SEQUENCE(vgpr16, lo16)
7573+
// Legalize operands of size-mismatches special inst between 16bit and 32bit
7574+
// in moveToVALU lowering in true16 mode. This is caused by 16bit values
7575+
// placed in both vgpr16 and sreg32 by isel. Including cases:
7576+
// Copy
7577+
// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
7578+
// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
7579+
// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
75747580
// 3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
7581+
//
7582+
// Reg_sequence
7583+
// dst32 = reg_sequence(vgpr32, lo16/hi16)
7584+
// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
7585+
//
75757586
// This can be removed after we have sgpr16 in place.
7576-
bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
7577-
MachineRegisterInfo &MRI,
7578-
SIInstrWorklist &Worklist) const {
7579-
Register DstReg = Copy.getOperand(0).getReg();
7580-
Register SrcReg = Copy.getOperand(1).getReg();
7581-
Register SrcSubReg = Copy.getOperand(1).getSubReg();
7582-
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
7583-
const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
7584-
bool KeepCopy;
7585-
7586-
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
7587-
KeepCopy = 0;
7588-
} else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
7589-
(SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
7590-
KeepCopy = 1;
7591-
Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7592-
Copy.getOperand(0).setReg(NewDstReg);
7593-
SrcReg = NewDstReg;
7594-
} else
7595-
return false;
7587+
void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
7588+
MachineRegisterInfo &MRI) const {
7589+
unsigned Opcode = Inst.getOpcode();
7590+
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7591+
switch (Opcode) {
7592+
case AMDGPU::COPY: {
7593+
Register SrcReg = Inst.getOperand(1).getReg();
7594+
if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
7595+
return;
75967596

7597-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7598-
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7599-
BuildMI(*Copy.getParent(), &Copy, Copy.getDebugLoc(),
7600-
get(AMDGPU::IMPLICIT_DEF), Undef);
7601-
BuildMI(*Copy.getParent(), std::next(Copy.getIterator()), Copy.getDebugLoc(),
7602-
get(AMDGPU::REG_SEQUENCE), NewDstReg)
7603-
.addReg(SrcReg)
7604-
.addImm(AMDGPU::lo16)
7605-
.addReg(Undef)
7606-
.addImm(AMDGPU::hi16);
7607-
if (!KeepCopy)
7608-
Copy.eraseFromParent();
7609-
MRI.replaceRegWith(DstReg, NewDstReg);
7610-
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7611-
return true;
7597+
bool SetSubReg = false;
7598+
Register SrcSubReg = Inst.getOperand(1).getSubReg();
7599+
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
7600+
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
7601+
} else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
7602+
(SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
7603+
SetSubReg = true;
7604+
} else
7605+
return;
7606+
7607+
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7608+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7609+
get(AMDGPU::IMPLICIT_DEF), Undef);
7610+
Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
7611+
if (SetSubReg)
7612+
Inst.getOperand(1).setSubReg(SrcSubReg);
7613+
7614+
Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
7615+
Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
7616+
Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
7617+
} break;
7618+
case AMDGPU::REG_SEQUENCE: {
7619+
for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
7620+
Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
7621+
auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
7622+
if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
7623+
MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
7624+
(SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
7625+
Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
7626+
}
7627+
}
7628+
} break;
7629+
}
76127630
}
76137631

76147632
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -8129,14 +8147,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
81298147
return;
81308148
}
81318149

8132-
// If this is a v2s copy between 16bit and 32bit reg,
8133-
// replace vgpr copy to reg_sequence
8134-
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8135-
Inst.getOperand(1).getReg().isVirtual() &&
8136-
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8137-
if (legalizeV2SCopyt16(Inst, MRI, Worklist))
8138-
return;
8139-
}
8150+
if (ST.useRealTrue16Insts())
8151+
legalizeSpecialInst_t16(Inst, MRI);
81408152

81418153
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
81428154
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,8 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13751375
MachineRegisterInfo &MRI) const;
13761376
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
13771377
MachineRegisterInfo &MRI) const;
1378-
bool legalizeV2SCopyt16(MachineInstr &Inst, MachineRegisterInfo &MRI,
1379-
SIInstrWorklist &Worklist) const;
1378+
void legalizeSpecialInst_t16(MachineInstr &Inst,
1379+
MachineRegisterInfo &MRI) const;
13801380

13811381
/// Replace the instructions opcode with the equivalent VALU
13821382
/// opcode. This function will also move the users of MachineInstructions

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,7 @@ body: |
138138
; GCN-LABEL: name: copy_vgpr16_sreg32_lo16_usedby_salu16
139139
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
140140
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
141-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].lo16
142-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
141+
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].lo16, %subreg.lo16, [[DEF1]], %subreg.hi16
143142
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
144143
%0:vgpr_32 = IMPLICIT_DEF
145144
%1:sreg_32 = COPY %0.lo16:vgpr_32
@@ -153,8 +152,7 @@ body: |
153152
; GCN-LABEL: name: copy_vgpr16_sreg32_hi16_usedby_salu16
154153
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
155154
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
156-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[DEF]].hi16
157-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY]], %subreg.lo16, [[DEF1]], %subreg.hi16
155+
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].hi16, %subreg.lo16, [[DEF1]], %subreg.hi16
158156
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
159157
%0:vgpr_32 = IMPLICIT_DEF
160158
%1:sreg_32 = COPY %0.hi16:vgpr_32
@@ -188,21 +186,17 @@ body: |
188186
...
189187

190188
---
191-
name: S_FMAC_F16
189+
name: reg_sequence_vgpr32_sreg32
192190
body: |
193191
bb.0:
194-
; GCN-LABEL: name: S_FMAC_F16
192+
; GCN-LABEL: name: reg_sequence_vgpr32_sreg32
195193
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
196-
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
197-
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF1]], %subreg.hi16
198-
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
199-
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
200-
; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
194+
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
195+
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF1]].lo16, %subreg.lo16, [[DEF]], %subreg.hi16
201196
%0:vgpr_16 = IMPLICIT_DEF
202-
%1:sgpr_lo16 = COPY %0:vgpr_16
203-
%2:sreg_32 = COPY %0:vgpr_16
204-
%3:sreg_32 = COPY %1:sgpr_lo16
205-
%4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
197+
%1:vgpr_32 = IMPLICIT_DEF
198+
%2:sreg_32 = COPY %1:vgpr_32
199+
%3:vgpr_32 = REG_SEQUENCE %2:sreg_32, %subreg.lo16, %0:vgpr_16, %subreg.hi16
206200
...
207201

208202
---

0 commit comments

Comments
 (0)