Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1088,8 +1088,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
"we have 16-bit VGPRs");
assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
Expand Down
13 changes: 10 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7261,7 +7261,8 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
OpIdx >= get(Opcode).getNumOperands())
OpIdx >= get(Opcode).getNumOperands() ||
get(Opcode).operands()[OpIdx].RegClass == -1)
return;

MachineOperand &Op = MI.getOperand(OpIdx);
Expand Down Expand Up @@ -7820,15 +7821,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// that copies will end up as machine instructions and not be
// eliminated.
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
MRI.clearKillFlags(Inst.getOperand(1).getReg());
Register NewDstReg = Inst.getOperand(1).getReg();
MRI.replaceRegWith(DstReg, NewDstReg);
MRI.clearKillFlags(NewDstReg);
Inst.getOperand(0).setReg(DstReg);
// Make sure we don't leave around a dead VGPR->SGPR copy. Normally
// these are deleted later, but at -O0 it would leave a suspicious
// looking illegal copy of an undef register.
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
Inst.removeOperand(I);
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
// Legalize t16 operand since replaceReg is called after addUsersToVALU
for (MachineOperand &MO :
make_early_inc_range(MRI.use_operands(NewDstReg))) {
legalizeOperandsVALUt16(*MO.getParent(), MRI);
}
return;
}

Expand Down
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3557,9 +3557,7 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {

const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 16)
return &AMDGPU::SGPR_LO16RegClass;
if (BitWidth == 32)
if (BitWidth == 16 || BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
if (BitWidth == 64)
return &AMDGPU::SReg_64RegClass;
Expand Down
23 changes: 20 additions & 3 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1534,17 +1534,34 @@ def : GCNPat<
>;

def : GCNPat<
(i64 (anyext i16:$src)),
(i64 (UniformUnaryFrag<anyext> i16:$src)),
(REG_SEQUENCE VReg_64,
(i32 (COPY $src)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
Comment on lines +1538 to +1540
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why VReg_64 and V_MOV_B32? I would expect SReg_64 and S_MOV_B32.

Stepping back a bit, I do not understand exactly how these patterns help. UniformUnaryFrag is not enough to ensure that the input is in an SGPR, since sometimes you get uniform values in VGPRs.

Copy link
Contributor Author

@broxigarchen broxigarchen Jun 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I copied what we have in the code base from Non-true16 flow to here (see line 1443). I am not sure why this is using VReg_64 either hmmm, looks like this works for both sgpr and vgpr. Let me try SReg_64 and see if this is better

These patterns did not solve the issue, but I think it helps a little as it generate less number of mismatched copy.

Copy link
Contributor Author

@broxigarchen broxigarchen Jun 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems using SReg_64 here introduces some unrelated changes in the test (and the change seems questionable). I would rather not do it in this patch

>;

def : GCNPat<
(i64 (DivergentUnaryFrag<anyext> i16:$src)),
(REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
>;

def : GCNPat<
(i16 (trunc i32:$src)),
(i16 (UniformUnaryFrag<trunc> i32:$src)),
(COPY $src)
>;

def : GCNPat<
(i16 (DivergentUnaryFrag<trunc> i32:$src)),
(EXTRACT_SUBREG $src, lo16)
>;

def : GCNPat <
(i16 (trunc i64:$src)),
(i16 (UniformUnaryFrag<trunc> i64:$src)),
(EXTRACT_SUBREG $src, sub0)
>;

def : GCNPat <
(i16 (DivergentUnaryFrag<trunc> i64:$src)),
(EXTRACT_SUBREG $src, lo16)
>;

Expand Down
45,159 changes: 16,460 additions & 28,699 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Large diffs are not rendered by default.

Loading
Loading