Skip to content

Commit 1807334

Browse files
committed
additional case for mismatched size copy
1 parent 1ad31d9 commit 1807334

9 files changed

+255
-620
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7565,6 +7565,52 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
75657565
legalizeOperandsVALUt16(MI, OpIdx, MRI);
75667566
}
75677567

7568+
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7569+
// lowering (lower the copy itself). Including cases:
7570+
// 1. sreg32 = copy vgpr16 => vgpr32 = REG_SEQUENCE(vgpr16, lo16)
7571+
// 2. sreg32 = copy .lo16:vgpr32 / sreg32 = copy .hi16:vgpr32
7572+
// => vgpr16 = copy .hi/lo16:vgpr32
7573+
// vgpr32 = REG_SEQUENCE(vgpr16, lo16)
7574+
// 3. sgpr16 = copy vgpr32/... (skipped, isel does not generate sgpr16)
7575+
// This can be removed after we have sgpr16 in place.
7576+
// Rewrites a VGPR-to-SGPR COPY whose source is 16 bits wide but whose
// VGPR-equivalent destination is wider, by packing the 16-bit value into the
// lo16 lane of a REG_SEQUENCE whose hi16 lane is an IMPLICIT_DEF.
//
// \param Copy     The COPY being legalized; it may be erased (case 1) or have
//                 its destination retargeted (case 2).
// \param MRI      Used to create virtual registers and rewrite uses of the
//                 old destination.
// \param Worklist Passed to addUsersToMoveToVALUWorklist for the new
//                 destination register.
// \returns true if the copy matched one of the handled cases and was
//          rewritten, false if nothing was changed.
bool SIInstrInfo::legalizeV2SCopyt16(MachineInstr &Copy,
7577+
MachineRegisterInfo &MRI,
7578+
SIInstrWorklist &Worklist) const {
7579+
Register DstReg = Copy.getOperand(0).getReg();
7580+
Register SrcReg = Copy.getOperand(1).getReg();
7581+
// NOTE(review): this holds a sub-register index (compared against
// AMDGPU::lo16/hi16 below), not a register; a plain `unsigned` would
// presumably express that more clearly -- confirm against getSubReg().
Register SrcSubReg = Copy.getOperand(1).getSubReg();
7582+
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Copy);
7583+
const TargetRegisterClass *SrcRegRC = getOpRegClass(Copy, 1);
7584+
// Whether the original COPY instruction survives the rewrite. Assigned on
// every path that does not return early.
// NOTE(review): assigned with 0/1 below rather than false/true.
bool KeepCopy;
7585+
7586+
// Case 1: the source class matches the lo16 sub-register of the new
// destination class (e.g. vgpr32 <- vgpr16). The COPY is replaced outright.
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
7587+
KeepCopy = 0;
7588+
// Case 2: a 32-bit VGPR destination fed from a .lo16/.hi16 sub-register read.
// The COPY is kept but retargeted to define a fresh VGPR_16, and that
// register becomes the input of the REG_SEQUENCE built below.
} else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
7589+
(SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
7590+
KeepCopy = 1;
7591+
Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7592+
Copy.getOperand(0).setReg(NewDstReg);
7593+
SrcReg = NewDstReg;
7594+
// Any other combination is left untouched for the caller to handle.
} else
7595+
return false;
7596+
7597+
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7598+
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7599+
// The IMPLICIT_DEF supplying the hi16 lane is inserted before the COPY.
BuildMI(*Copy.getParent(), &Copy, Copy.getDebugLoc(),
7600+
get(AMDGPU::IMPLICIT_DEF), Undef);
7601+
// The REG_SEQUENCE is inserted right after the COPY so that, in case 2, it
// follows the instruction defining SrcReg. The 16-bit value always lands in
// lo16 of the result, including when it was read from .hi16 of the source.
BuildMI(*Copy.getParent(), std::next(Copy.getIterator()), Copy.getDebugLoc(),
7602+
get(AMDGPU::REG_SEQUENCE), NewDstReg)
7603+
.addReg(SrcReg)
7604+
.addImm(AMDGPU::lo16)
7605+
.addReg(Undef)
7606+
.addImm(AMDGPU::hi16);
7607+
if (!KeepCopy)
7608+
Copy.eraseFromParent();
7609+
// Redirect remaining references to the old destination onto the new register,
// then queue that register's users for the move-to-VALU worklist.
MRI.replaceRegWith(DstReg, NewDstReg);
7610+
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7611+
return true;
7612+
}
7613+
75687614
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
75697615
MachineDominatorTree *MDT) const {
75707616

@@ -8083,6 +8129,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
80838129
return;
80848130
}
80858131

8132+
// If this is a v2s copy between 16bit and 32bit reg,
8133+
// replace the vgpr copy with a reg_sequence
8134+
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8135+
Inst.getOperand(1).getReg().isVirtual() &&
8136+
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8137+
if (legalizeV2SCopyt16(Inst, MRI, Worklist))
8138+
return;
8139+
}
8140+
80868141
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
80878142
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
80888143
// Instead of creating a copy where src and dst are the same register
@@ -8105,38 +8160,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
81058160
return;
81068161
}
81078162

8108-
// If this is a v2s copy between 16bit and 32bit reg,
8109-
// replace vgpr copy to reg_sequence/extract_subreg
8110-
// This can be remove after we have sgpr16 in place
8111-
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8112-
Inst.getOperand(1).getReg().isVirtual() &&
8113-
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8114-
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8115-
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8116-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8117-
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8118-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8119-
get(AMDGPU::IMPLICIT_DEF), Undef);
8120-
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8121-
get(AMDGPU::REG_SEQUENCE), NewDstReg)
8122-
.addReg(Inst.getOperand(1).getReg())
8123-
.addImm(AMDGPU::lo16)
8124-
.addReg(Undef)
8125-
.addImm(AMDGPU::hi16);
8126-
Inst.eraseFromParent();
8127-
MRI.replaceRegWith(DstReg, NewDstReg);
8128-
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8129-
return;
8130-
} else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8131-
AMDGPU::lo16)) {
8132-
Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8133-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8134-
MRI.replaceRegWith(DstReg, NewDstReg);
8135-
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8136-
return;
8137-
}
8138-
}
8139-
81408163
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
81418164
MRI.replaceRegWith(DstReg, NewDstReg);
81428165
legalizeOperands(Inst, MDT);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13751375
MachineRegisterInfo &MRI) const;
13761376
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
13771377
MachineRegisterInfo &MRI) const;
1378+
bool legalizeV2SCopyt16(MachineInstr &Inst, MachineRegisterInfo &MRI,
1379+
SIInstrWorklist &Worklist) const;
13781380

13791381
/// Replace the instructions opcode with the equivalent VALU
13801382
/// opcode. This function will also move the users of MachineInstruntions

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll

Lines changed: 14 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4913,12 +4913,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
49134913
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
49144914
; GFX11-TRUE16: ; %bb.0:
49154915
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4916-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
49174916
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
49184917
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
49194918
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
49204919
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
4921-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
49224920
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
49234921
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
49244922
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -8342,12 +8340,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
83428340
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
83438341
; GFX11-TRUE16: ; %bb.0:
83448342
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8345-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
83468343
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
83478344
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
83488345
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
83498346
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
8350-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
83518347
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
83528348
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
83538349
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -12629,12 +12625,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
1262912625
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
1263012626
; GFX11-TRUE16: ; %bb.0:
1263112627
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12632-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
1263312628
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
1263412629
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
1263512630
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
1263612631
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
12637-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
1263812632
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
1263912633
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
1264012634
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -16043,12 +16037,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
1604316037
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
1604416038
; GFX11-TRUE16: ; %bb.0:
1604516039
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16046-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
1604716040
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
1604816041
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
1604916042
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
1605016043
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
16051-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
1605216044
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
1605316045
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
1605416046
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -19655,12 +19647,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
1965519647
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
1965619648
; GFX11-TRUE16: ; %bb.0:
1965719649
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19658-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
1965919650
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
1966019651
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
1966119652
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
1966219653
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
19663-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
1966419654
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
1966519655
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
1966619656
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -23094,12 +23084,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
2309423084
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
2309523085
; GFX11-TRUE16: ; %bb.0:
2309623086
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23097-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
2309823087
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
2309923088
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
2310023089
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
2310123090
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
23102-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
2310323091
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
2310423092
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
2310523093
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -25911,12 +25899,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
2591125899
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
2591225900
; GFX11-TRUE16: ; %bb.0:
2591325901
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25914-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
2591525902
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
2591625903
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
2591725904
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
2591825905
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
25919-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
2592025906
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
2592125907
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
2592225908
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -29258,12 +29244,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
2925829244
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
2925929245
; GFX11-TRUE16: ; %bb.0:
2926029246
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29261-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
2926229247
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
2926329248
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
2926429249
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
2926529250
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
29266-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
2926729251
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
2926829252
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
2926929253
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -31057,12 +31041,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
3105731041
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
3105831042
; GFX11-TRUE16: ; %bb.0:
3105931043
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31060-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
3106131044
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
3106231045
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
3106331046
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
3106431047
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
31065-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
3106631048
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
3106731049
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
3106831050
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31074,12 +31056,12 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
3107431056
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
3107531057
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
3107631058
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
31077-
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
31059+
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
3107831060
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
3107931061
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
3108031062
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
3108131063
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
31082-
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
31064+
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
3108331065
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
3108431066
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
3108531067
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -31103,11 +31085,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
3110331085
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
3110431086
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
3110531087
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
31106-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
31088+
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
3110731089
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
3110831090
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
3110931091
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
31110-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
31092+
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
3111131093
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
3111231094
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
3111331095
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
@@ -31123,7 +31105,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
3112331105
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
3112431106
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
3112531107
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
31126-
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
31108+
; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
3112731109
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
3112831110
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
3112931111
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
@@ -31168,9 +31150,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
3116831150
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
3116931151
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
3117031152
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
31171-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
31153+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
3117231154
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
31173-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
31155+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
3117431156
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
3117531157
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
3117631158
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -32879,12 +32861,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
3287932861
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
3288032862
; GFX11-TRUE16: ; %bb.0:
3288132863
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32882-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
3288332864
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
3288432865
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
3288532866
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
3288632867
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
32887-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
3288832868
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
3288932869
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
3289032870
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32896,12 +32876,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
3289632876
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
3289732877
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
3289832878
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
32899-
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
32879+
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
3290032880
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
3290132881
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
3290232882
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
3290332883
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
32904-
; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
32884+
; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
3290532885
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
3290632886
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
3290732887
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -32925,11 +32905,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
3292532905
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
3292632906
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
3292732907
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
32928-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
32908+
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
3292932909
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
3293032910
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
3293132911
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
32932-
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
32912+
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
3293332913
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
3293432914
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
3293532915
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
@@ -32945,7 +32925,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
3294532925
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
3294632926
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
3294732927
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
32948-
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
32928+
; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
3294932929
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
3295032930
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
3295132931
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
@@ -32990,9 +32970,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
3299032970
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
3299132971
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
3299232972
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
32993-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
32973+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
3299432974
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
32995-
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
32975+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
3299632976
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
3299732977
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
3299832978
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2

0 commit comments

Comments
 (0)