@@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
    return;

  unsigned Opcode = MI.getOpcode();
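+  // REG_SEQUENCE mixes 16-bit and 32-bit pieces, so legalize it as a whole
+  // instruction instead of operand by operand.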
+  if (Opcode == AMDGPU::REG_SEQUENCE) {
+    legalizeSpecialInst_t16(MI, MRI);
+    return;
+  }
+
  MachineBasicBlock *MBB = MI.getParent();
  // Legalize operands and check for size mismatch
  if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7565,50 +7570,63 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
    legalizeOperandsVALUt16(MI, OpIdx, MRI);
}

-// Legalize size mismatches between 16bit and 32bit registers in v2s copy
-// lowering (lower the copy itself). Including cases:
-// 1. sreg32 = copy vgpr16 => vgpr32 = REG_SEQUENCE(vgpr16, lo16)
-// 2. sreg32 = copy .lo16:vgpr32 / sreg32 = copy .hi16:vgpr32
-//    => vgpr16 = copy .hi/lo16:vgpr32
-//       vgpr32 = REG_SEQUENCE(vgpr16, lo16)
+// Legalize operands of special instructions with a size mismatch between
+// 16-bit and 32-bit registers during moveToVALU lowering in true16 mode.
+// This is needed because isel places 16-bit values in both vgpr16 and
+// sreg32 registers. Cases handled:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+//    => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
// 3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
+//
+// Reg_sequence
+// dst32 = reg_sequence(vgpr32, lo16/hi16)
+//   => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
// This can be removed after we have sgpr16 in place.
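+// For example, case 1 in MIR form (illustrative; the dst register class is
+// fixed up afterwards by the normal moveToVALU handling):
+//   %1:sreg_32 = COPY %0:vgpr_16
+// becomes
+//   %2:vgpr_16 = IMPLICIT_DEF
+//   %1:vgpr_32 = REG_SEQUENCE %0:vgpr_16, %subreg.lo16, %2, %subreg.hi16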
-bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
-                                     MachineRegisterInfo &MRI,
-                                     SIInstrWorklist &Worklist) const {
-  Register DstReg = Copy.getOperand(0).getReg();
-  Register SrcReg = Copy.getOperand(1).getReg();
-  Register SrcSubReg = Copy.getOperand(1).getSubReg();
-  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
-  const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
-  bool KeepCopy;
-
-  if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
-    KeepCopy = 0;
-  } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
-             (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
-    KeepCopy = 1;
-    Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
-    Copy.getOperand(0).setReg(NewDstReg);
-    SrcReg = NewDstReg;
-  } else
-    return false;
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+                                          MachineRegisterInfo &MRI) const {
+  unsigned Opcode = Inst.getOpcode();
+  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+  switch (Opcode) {
+  case AMDGPU::COPY: {
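+    // Only rewrite copies whose source is a virtual VGPR; everything else
+    // is left to the generic copy lowering.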
+    Register SrcReg = Inst.getOperand(1).getReg();
+    if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+      return;

-  Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
-  Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
-  BuildMI(*Copy.getParent(), &Copy, Copy.getDebugLoc(),
-          get(AMDGPU::IMPLICIT_DEF), Undef);
-  BuildMI(*Copy.getParent(), std::next(Copy.getIterator()), Copy.getDebugLoc(),
-          get(AMDGPU::REG_SEQUENCE), NewDstReg)
-      .addReg(SrcReg)
-      .addImm(AMDGPU::lo16)
-      .addReg(Undef)
-      .addImm(AMDGPU::hi16);
-  if (!KeepCopy)
-    Copy.eraseFromParent();
-  MRI.replaceRegWith(DstReg, NewDstReg);
-  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-  return true;
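+    // Distinguish case 1 (a plain vgpr16 source becomes the lo16 piece)
+    // from case 2 (a lo16/hi16 subregister of a vgpr32 source, which keeps
+    // its subregister index on the rewritten operand).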
+    bool SetSubReg = false;
+    Register SrcSubReg = Inst.getOperand(1).getSubReg();
+    const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+    if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+    } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+               (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+      SetSubReg = true;
+    } else
+      return;
+
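+    // Pad the hi16 lane with an IMPLICIT_DEF so the REG_SEQUENCE fully
+    // defines the 32-bit result, then rewrite the COPY in place.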
+    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+            get(AMDGPU::IMPLICIT_DEF), Undef);
+    Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+    if (SetSubReg)
+      Inst.getOperand(1).setSubReg(SrcSubReg);
+
+    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+    Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+  } break;
+  case AMDGPU::REG_SEQUENCE: {
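+    // Operands come in (reg, subreg-imm) pairs after the def. A vgpr32
+    // source used as a lo16/hi16 piece holds its value in the low half, so
+    // read it through its lo16 subregister.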
+    for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+      Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+      auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+      if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+          MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+          (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+        Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+      }
+    }
+  } break;
+  }
}

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -8129,14 +8147,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
    return;
  }

-  // If this is a v2s copy between 16bit and 32bit reg,
-  // replace vgpr copy to reg_sequence
-  if (ST.useRealTrue16Insts() && Inst.isCopy() &&
-      Inst.getOperand(1).getReg().isVirtual() &&
-      RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
-    if (legalizeV2SCopyt16(Inst, MRI, Worklist))
-      return;
-  }
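+  // Rewrite mismatched true16 copies and reg_sequences in place before the
+  // generic lowering below runs.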
+  if (ST.useRealTrue16Insts())
+    legalizeSpecialInst_t16(Inst, MRI);

  if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
      NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {