Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 67 additions & 32 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
return;

unsigned Opcode = MI.getOpcode();
if (Opcode == AMDGPU::REG_SEQUENCE) {
legalizeSpecialInst_t16(MI, MRI);
return;
}

MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
Expand Down Expand Up @@ -7565,6 +7570,65 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}

// Legalize "special" instructions (COPY and REG_SEQUENCE) whose operands mix
// 16-bit and 32-bit registers during moveToVALU lowering in true16 mode. The
// mismatch arises because isel places 16-bit values in both vgpr16 and sreg32.
// Handled cases:
//   Copy
//     1. dst32 = copy vgpr16
//          => dst32 = REG_SEQUENCE(vgpr16, lo16, undef, hi16)
//     2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
//          => dst32 = REG_SEQUENCE(.lo16/.hi16:vgpr32, lo16, undef, hi16)
//     3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
//
//   Reg_sequence
//     dst32 = reg_sequence(vgpr32, lo16/hi16)
//       => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
//
// Inst is rewritten in place; instructions with any other opcode, and COPYs
// or REG_SEQUENCE operands that do not match the patterns above, are left
// untouched.
//
// This can be removed once sgpr16 is in place.
void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
                                          MachineRegisterInfo &MRI) const {
  switch (Inst.getOpcode()) {
  case AMDGPU::COPY: {
    Register SrcReg = Inst.getOperand(1).getReg();
    if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
      return;

    // NOTE(review): defensive guard; assumes getDestEquivalentVGPRClass may
    // yield null when there is no VGPR class to move the def to -- confirm.
    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
    if (!NewDstRC)
      return;

    // getSubReg() yields a subregister *index*, not a register.
    const unsigned SrcSubReg = Inst.getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);

    // Case 1: the source class is the lo16 half of the destination class.
    const bool SrcIsLo16OfDst =
        RI.getMatchingSuperRegClass(NewDstRC, SrcRC, AMDGPU::lo16) != nullptr;
    // Case 2: a 16-bit slice of a register is copied into a 32-bit VGPR.
    const bool SrcIs16BitSlice =
        NewDstRC == &AMDGPU::VGPR_32RegClass &&
        (SrcSubReg == AMDGPU::lo16 || SrcSubReg == AMDGPU::hi16);
    if (!SrcIsLo16OfDst && !SrcIs16BitSlice)
      return;

    // The upper half of the widened value is filled with an undefined vgpr16.
    Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
            get(AMDGPU::IMPLICIT_DEF), Undef);

    // Morph the COPY into REG_SEQUENCE(src, lo16, undef, hi16) in place. The
    // source operand keeps whatever subregister index it already carried, so
    // no explicit setSubReg() is needed for case 2.
    Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
    Inst.addOperand(MachineOperand::CreateReg(Undef, /*isDef=*/false));
    Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
    break;
  }
  case AMDGPU::REG_SEQUENCE: {
    // Operands after the def come in (source register, subreg index) pairs.
    // Iterating by operand index avoids the unsigned underflow of
    // (getNumOperands() - 1) / 2 on an operand-less instruction.
    for (unsigned I = 1, E = Inst.getNumOperands(); I + 1 < E; I += 2) {
      // Only sources feeding a 16-bit slot of the result need narrowing.
      // Checking this first keeps constrainRegClass() below from mutating the
      // register class of sources that feed wider slots.
      const int64_t DstSubIdx = Inst.getOperand(I + 1).getImm();
      if (DstSubIdx != AMDGPU::lo16 && DstSubIdx != AMDGPU::hi16)
        continue;

      // Never overwrite a subregister index the operand already carries;
      // turning an existing .hi16 into .lo16 would read the wrong half.
      if (Inst.getOperand(I).getSubReg())
        continue;

      Register SrcReg = Inst.getOperand(I).getReg();
      if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
        continue;

      // A source that can live in VGPR_32 is a full 32-bit register feeding a
      // 16-bit slot: read only its low half.
      if (MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass))
        Inst.getOperand(I).setSubReg(AMDGPU::lo16);
    }
    break;
  }
  default:
    break;
  }
}

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {

Expand Down Expand Up @@ -8083,6 +8147,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}

if (ST.useRealTrue16Insts())
legalizeSpecialInst_t16(Inst, MRI);

if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
Expand All @@ -8105,38 +8172,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}

// If this is a v2s copy between a 16-bit and a 32-bit reg,
// replace the vgpr copy with reg_sequence/extract_subreg.
// This can be removed once sgpr16 is in place.
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::IMPLICIT_DEF), Undef);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::REG_SEQUENCE), NewDstReg)
.addReg(Inst.getOperand(1).getReg())
.addImm(AMDGPU::lo16)
.addReg(Undef)
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();
MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
} else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
AMDGPU::lo16)) {
Inst.getOperand(1).setSubReg(AMDGPU::lo16);
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
}
}

Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
void legalizeSpecialInst_t16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const;

/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstructions
Expand Down
48 changes: 14 additions & 34 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4913,12 +4913,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -8342,12 +8340,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -12629,12 +12625,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -16043,12 +16037,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -19655,12 +19647,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -23094,12 +23084,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -25911,12 +25899,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -29258,12 +29244,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
Expand Down Expand Up @@ -31057,12 +31041,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
Expand All @@ -31074,12 +31056,12 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
Expand All @@ -31103,11 +31085,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
Expand All @@ -31123,7 +31105,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
Expand Down Expand Up @@ -31168,9 +31150,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
Expand Down Expand Up @@ -32879,12 +32861,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
Expand All @@ -32896,12 +32876,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
Expand All @@ -32925,11 +32905,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
Expand All @@ -32945,7 +32925,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
Expand Down Expand Up @@ -32990,9 +32970,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
Expand Down
Loading