diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 6fc57dec6a826..71c720ed09b5f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3516,30 +3516,6 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { OpType <= AMDGPU::OPERAND_SRC_LAST; } -bool SIRegisterInfo::shouldRewriteCopySrc( - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const { - // We want to prefer the smallest register class possible, so we don't want to - // stop and rewrite on anything that looks like a subregister - // extract. Operations mostly don't care about the super register class, so we - // only want to stop on the most basic of copies between the same register - // class. - // - // e.g. if we have something like - // %0 = ... - // %1 = ... - // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 - // %3 = COPY %2, sub0 - // - // We want to look through the COPY to find: - // => %3 = COPY %0 - - // Plain copy. - return getCommonSubClass(DefRC, SrcRC) != nullptr; -} - bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { // TODO: 64-bit operands have extending behavior from 32-bit literal. return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 8e481e3ac2304..a434efb70d052 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -275,11 +275,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass *SubRC, unsigned SubIdx) const; - bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; - /// \returns True if operands defined with this operand type can accept /// a literal constant (i.e. any 32-bit immediate). bool opCanUseLiteralConstant(unsigned OpType) const; diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index c6c0b9cf8f027..cc2f775ff22bc 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -163,33 +163,33 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s19, s3 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s8 -; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s20, s10 -; SI-NEXT: s_mov_b32 s21, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 3504546801c93..2258f6a7b5483 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -334,58 +334,58 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[4:5], 0xf +; SI-NEXT: s_load_dword s6, s[4:5], 0xf ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2 +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x2 ; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %if -; SI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] -; SI-NEXT: s_mov_b32 s5, 0 +; SI-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; SI-NEXT: s_mov_b32 s7, 0 ; SI-NEXT: .LBB7_3: ; %endif -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s8, s[4:5], 0x3c +; VI-NEXT: s_load_dword s6, s[4:5], 0x3c ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s4, s[6:7] -; VI-NEXT: s_mov_b32 s5, 0 +; VI-NEXT: s_bcnt1_i32_b64 s6, s[4:5] +; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: .LBB7_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB7_4: -; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; VI-NEXT: s_branch .LBB7_2 entry: %tmp0 = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index cd85c301e16d5..b443e654350c5 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -151,20 +151,20 @@ entry: define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MulMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -174,8 +174,8 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MulMul: @@ -1698,20 +1698,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1719,10 +1719,10 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_add1: @@ -1851,20 +1851,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_add1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -1872,10 +1872,10 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_add1: @@ -2004,20 +2004,20 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -2025,10 +2025,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul1: @@ -2163,20 +2163,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2184,10 +2184,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s4 +; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul1: @@ -2322,31 +2322,31 @@ entry: define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: udot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 +; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_MultipleUses_mul2: @@ -2479,20 +2479,20 @@ entry: define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-LABEL: idot2_MultipleUses_mul2: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -2500,10 +2500,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s4 +; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot2_MultipleUses_mul2: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 0573de4a7f2d1..fe693b4af67f3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2373,21 +2373,21 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -2406,10 +2406,10 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 @@ -2418,51 +2418,50 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v10 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir index d32163bb69235..057769372c041 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir @@ -14,7 +14,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -60,7 +60,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub0, %subreg.sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64_align2 = COPY $vgpr0_vgpr1 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -106,7 +106,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub1, %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64 = COPY $vgpr1_vgpr2 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -129,7 +129,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[COPY]].sub1, %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64 = COPY $vgpr1_vgpr2 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir index c5e51aa1b4fe8..e1ff42125ce9a 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir @@ -23,8 +23,8 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[REG_SEQUENCE]].sub1, %subreg.sub0, [[REG_SEQUENCE]].sub0, %subreg.sub1 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; GCN-NEXT: KILL [[COPY3]], implicit [[COPY2]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index e2bcf3f6a2e2c..3a872a6080952 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9779,111 +9779,118 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:224 -; GFX6-NEXT: s_mov_b32 s2, 0x86a00 -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x86600 +; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x86200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x85e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s2, 0x83e00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x83a00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s2, 0x83200 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[8:11], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32 ; GFX6-NEXT: s_mov_b32 s2, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b32 s2, 0x83a00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -9898,16 +9905,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v8 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: s_mov_b32 s0, 0x86a00 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; GFX6-NEXT: ;;#ASMSTART @@ -9930,7 +9938,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: s_mov_b32 s6, 0x83200 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] ; GFX6-NEXT: ;;#ASMEND @@ -9949,11 +9957,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s6 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s6 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s6 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s6 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_cbranch_execz .LBB1_2 @@ -10184,127 +10187,126 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:240 -; GFX6-NEXT: s_mov_b32 s0, 0x86a00 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x86600 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] ; GFX6-NEXT: s_mov_b32 s0, 0x86200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s0, 0x86a00 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -10322,59 +10324,60 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:192 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:160 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:112 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v19, 13, v4 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 -; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:80 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:64 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:64 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:48 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:48 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:32 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_nop 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s[0:7] ; GFX9-FLATSCR-NEXT: ;;#ASMEND @@ -10403,27 +10406,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_nop 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v20 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v22 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v3 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -10439,53 +10441,53 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:112 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:112 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:96 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:96 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:80 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:80 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:64 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:64 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:48 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:48 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:32 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:16 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37] +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:32 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:240 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:16 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:240 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:224 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:208 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:192 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[11:14], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:144 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)