Skip to content

Commit 9d7e1d9

Browse files
authored
[AMDGPU][True16] added Pre-RA hint to improve copy elimination (llvm#103366)
The allocation order of 16-bit registers is vgpr0lo16, vgpr0hi16, vgpr1lo16, vgpr1hi16, vgpr2lo16, and so on. We prefer (essentially require) that allocation order, because it uses the minimum number of registers. But when 16-bit data passes between 16-bit and 32-bit instructions, you get lots of COPYs. This patch teaches the compiler that a COPY of a 16-bit value from a 32-bit register to a lo-half 16-bit register is free, whereas a COPY to a hi-half 16-bit register is not. This might later be improved by coalescing additional cases, perhaps as an alternative to the RA hints. For now, this solution is being upstreamed first.
1 parent 5929de8 commit 9d7e1d9

26 files changed

+283
-306
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

+43
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,22 @@
2222
/// although the same shall be possible with other register classes and
2323
/// instructions if necessary.
2424
///
25+
/// This pass also adds register allocation hints to COPY.
26+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
28+
/// This pass also adds register allocation hints to COPY.
29+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
30+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
31+
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
32+
/// the VGPR_32, the COPY can be completely eliminated.
33+
///
2534
//===----------------------------------------------------------------------===//
2635

2736
#include "GCNPreRAOptimizations.h"
2837
#include "AMDGPU.h"
2938
#include "GCNSubtarget.h"
3039
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
40+
#include "SIRegisterInfo.h"
3141
#include "llvm/CodeGen/LiveIntervals.h"
3242
#include "llvm/CodeGen/MachineFunctionPass.h"
3343
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253263
Changed |= processReg(Reg);
254264
}
255265

266+
if (!ST.useRealTrue16Insts())
267+
return Changed;
268+
269+
// Add RA hints to improve True16 COPY elimination.
270+
for (const MachineBasicBlock &MBB : MF) {
271+
for (const MachineInstr &MI : MBB) {
272+
if (MI.getOpcode() != AMDGPU::COPY)
273+
continue;
274+
Register Dst = MI.getOperand(0).getReg();
275+
Register Src = MI.getOperand(1).getReg();
276+
if (Dst.isVirtual() &&
277+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
278+
Src.isPhysical() &&
279+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
280+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
281+
if (Src.isVirtual() &&
282+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
283+
Dst.isPhysical() &&
284+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
285+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
286+
if (!Dst.isVirtual() || !Src.isVirtual())
287+
continue;
288+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
289+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
290+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
291+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
292+
}
293+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
294+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
295+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
296+
}
297+
}
298+
256299
return Changed;
257300
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

+67
Original file line numberDiff line numberDiff line change
@@ -3713,6 +3713,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
37133713
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
37143714
}
37153715

3716+
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3717+
ArrayRef<MCPhysReg> Order,
3718+
SmallVectorImpl<MCPhysReg> &Hints,
3719+
const MachineFunction &MF,
3720+
const VirtRegMap *VRM,
3721+
const LiveRegMatrix *Matrix) const {
3722+
3723+
const MachineRegisterInfo &MRI = MF.getRegInfo();
3724+
const SIRegisterInfo *TRI = ST.getRegisterInfo();
3725+
3726+
std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3727+
3728+
switch (Hint.first) {
3729+
case AMDGPURI::Size32: {
3730+
Register Paired = Hint.second;
3731+
assert(Paired);
3732+
Register PairedPhys;
3733+
if (Paired.isPhysical()) {
3734+
PairedPhys =
3735+
getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3736+
} else if (VRM && VRM->hasPhys(Paired)) {
3737+
PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3738+
&AMDGPU::VGPR_32RegClass);
3739+
}
3740+
3741+
// Prefer the paired physreg.
3742+
if (PairedPhys)
3743+
// isLo(Paired) is implicitly true here from the API of
3744+
// getMatchingSuperReg.
3745+
Hints.push_back(PairedPhys);
3746+
return false;
3747+
}
3748+
case AMDGPURI::Size16: {
3749+
Register Paired = Hint.second;
3750+
assert(Paired);
3751+
Register PairedPhys;
3752+
if (Paired.isPhysical()) {
3753+
PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3754+
} else if (VRM && VRM->hasPhys(Paired)) {
3755+
PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3756+
}
3757+
3758+
// First prefer the paired physreg.
3759+
if (PairedPhys)
3760+
Hints.push_back(PairedPhys);
3761+
else {
3762+
// Add all the lo16 physregs.
3763+
// When the Paired operand has not yet been assigned a physreg it is
3764+
// better to try putting VirtReg in a lo16 register, because possibly
3765+
// later Paired can be assigned to the overlapping register and the COPY
3766+
// can be eliminated.
3767+
for (MCPhysReg PhysReg : Order) {
3768+
if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3769+
continue;
3770+
if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3771+
!MRI.isReserved(PhysReg))
3772+
Hints.push_back(PhysReg);
3773+
}
3774+
}
3775+
return false;
3776+
}
3777+
default:
3778+
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3779+
VRM);
3780+
}
3781+
}
3782+
37163783
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
37173784
// Not a callee saved register.
37183785
return AMDGPU::SGPR30_SGPR31;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

+12
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class LiveRegUnits;
2929
class RegisterBank;
3030
struct SGPRSpillBuilder;
3131

32+
/// Register allocation hint types. Helps eliminate unneeded COPY with True16
33+
namespace AMDGPURI {
34+
35+
enum { Size16 = 1, Size32 = 2 };
36+
37+
} // end namespace AMDGPURI
38+
3239
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3340
private:
3441
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
329336
unsigned getRegPressureSetLimit(const MachineFunction &MF,
330337
unsigned Idx) const override;
331338

339+
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
340+
SmallVectorImpl<MCPhysReg> &Hints,
341+
const MachineFunction &MF, const VirtRegMap *VRM,
342+
const LiveRegMatrix *Matrix) const override;
343+
332344
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
333345

334346
MCRegister getReturnAddressReg(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/bf16.ll

+41-50
Original file line numberDiff line numberDiff line change
@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
3771237712
; GFX11TRUE16-LABEL: v_select_bf16:
3771337713
; GFX11TRUE16: ; %bb.0:
3771437714
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37715-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37716-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37717-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37718-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37719-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37720-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37715+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37716+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37717+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37718+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
3772137719
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3772237720
;
3772337721
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3778537783
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
3778637784
; GFX11TRUE16: ; %bb.0:
3778737785
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37788-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37789-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
37790-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
37791-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37792-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37793-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37794-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37795-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
37786+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37787+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37788+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37789+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
37790+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
3779637791
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3779737792
;
3779837793
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3786237857
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
3786337858
; GFX11TRUE16: ; %bb.0:
3786437859
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37865-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37866-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37867-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37868-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37869-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37870-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37871-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37872-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37860+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37861+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37862+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37863+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
37864+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
3787337865
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3787437866
;
3787537867
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42659,17 +42651,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4265942651
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
4266042652
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
4266142653
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42662-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4266342654
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
4266442655
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42656+
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4266542657
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
4266642658
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
4266742659
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
4266842660
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
4266942661
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
4267042662
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
4267142663
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42672-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4267342664
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
4267442665
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
4267542666
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42693,6 +42684,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4269342684
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
4269442685
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
4269542686
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42687+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4269642688
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
4269742689
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
4269842690
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42722,45 +42714,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4272242714
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
4272342715
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
4272442716
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42725-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
42717+
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
4272642718
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
42727-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
42719+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
4272842720
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
42729-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
42721+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
4273042722
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
4273142723
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
42732-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42733-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42724+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
4273442725
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
42735-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
42726+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
4273642727
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
4273742728
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
42738-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
42729+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
4273942730
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
42740-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
42731+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
4274142732
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
4274242733
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
42743-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
42734+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
4274442735
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
42745-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
42736+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
4274642737
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
4274742738
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
42748-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
42739+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
4274942740
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
42750-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
42741+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
4275142742
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
4275242743
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
42753-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
42744+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
4275442745
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
42755-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
42746+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
4275642747
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
4275742748
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
42758-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
42749+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
4275942750
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
42760-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
42751+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
4276142752
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
4276242753
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
42763-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
42754+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
4276442755
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
4276542756
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
4276642757
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42798,20 +42789,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4279842789
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
4279942790
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
4280042791
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
42801-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
42792+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
4280242793
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
4280342794
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
4280442795
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
4280542796
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
4280642797
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
4280742798
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
42808-
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
42809-
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
42810-
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
42811-
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
42812-
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
42813-
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
42814-
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
42799+
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
42800+
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
42801+
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
42802+
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
42803+
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
42804+
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
42805+
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
4281542806
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
4281642807
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
4281742808
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42820,7 +42811,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4282042811
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
4282142812
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
4282242813
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
42823-
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
42814+
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
4282442815
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
4282542816
;
4282642817
; GFX11FAKE16-LABEL: v_vselect_v32bf16:

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

+3-5
Original file line numberDiff line numberDiff line change
@@ -908,10 +908,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
908908
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
909909
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
910910
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
911-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
912911
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
913912
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
914-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
913+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
915914
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
916915
;
917916
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -981,12 +980,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
981980
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
982981
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
983982
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
984-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
985-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
983+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
986984
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
987985
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
988986
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
989-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
987+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
990988
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
991989
;
992990
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

+2-6
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
7676
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
7878
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
79+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
8280
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
8381
; GFX11-SDAG-NEXT: s_endpgm
8482
;
@@ -98,9 +96,7 @@ define amdgpu_kernel void @fadd_f16(
9896
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
9997
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
10098
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
101-
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
102-
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
103-
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
99+
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
104100
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
105101
; GFX11-GISEL-NEXT: s_endpgm
106102
;

0 commit comments

Comments
 (0)