@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
37712
37712
; GFX11TRUE16-LABEL: v_select_bf16:
37713
37713
; GFX11TRUE16: ; %bb.0:
37714
37714
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37715
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37716
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37717
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37718
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37719
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37720
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37715
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37716
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37717
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37718
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
37721
37719
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37722
37720
;
37723
37721
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
37785
37783
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
37786
37784
; GFX11TRUE16: ; %bb.0:
37787
37785
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37788
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37789
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
37790
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
37791
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37792
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37793
- ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37794
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37795
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
37786
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37787
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37788
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37789
+ ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
37790
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
37796
37791
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37797
37792
;
37798
37793
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
37862
37857
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
37863
37858
; GFX11TRUE16: ; %bb.0:
37864
37859
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37865
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37866
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37867
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37868
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37869
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37870
- ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37871
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37872
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37860
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37861
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37862
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37863
+ ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
37864
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
37873
37865
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37874
37866
;
37875
37867
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42659,17 +42651,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42659
42651
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
42660
42652
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
42661
42653
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42662
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42663
42654
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
42664
42655
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42656
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42665
42657
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
42666
42658
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
42667
42659
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
42668
42660
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
42669
42661
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
42670
42662
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
42671
42663
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42672
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
42673
42664
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
42674
42665
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
42675
42666
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42693,6 +42684,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42693
42684
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
42694
42685
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
42695
42686
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42687
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
42696
42688
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
42697
42689
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
42698
42690
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42722,45 +42714,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42722
42714
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
42723
42715
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
42724
42716
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42725
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
42717
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
42726
42718
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
42727
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
42719
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
42728
42720
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
42729
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
42721
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
42730
42722
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
42731
42723
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
42732
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42733
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42724
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
42734
42725
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
42735
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
42726
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
42736
42727
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
42737
42728
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
42738
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
42729
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
42739
42730
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
42740
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
42731
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
42741
42732
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
42742
42733
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
42743
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
42734
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
42744
42735
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
42745
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
42736
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
42746
42737
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
42747
42738
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
42748
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
42739
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
42749
42740
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
42750
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
42741
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
42751
42742
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
42752
42743
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
42753
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
42744
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
42754
42745
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
42755
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
42746
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
42756
42747
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
42757
42748
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
42758
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
42749
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
42759
42750
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
42760
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
42751
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
42761
42752
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
42762
42753
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
42763
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
42754
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
42764
42755
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
42765
42756
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
42766
42757
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42798,20 +42789,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42798
42789
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
42799
42790
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
42800
42791
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
42801
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
42792
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
42802
42793
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
42803
42794
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
42804
42795
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
42805
42796
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
42806
42797
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
42807
42798
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
42808
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
42809
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
42810
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
42811
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
42812
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
42813
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
42814
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
42799
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
42800
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
42801
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
42802
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
42803
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
42804
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
42805
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
42815
42806
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
42816
42807
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
42817
42808
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42820,7 +42811,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42820
42811
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
42821
42812
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
42822
42813
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
42823
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
42814
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
42824
42815
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
42825
42816
;
42826
42817
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
0 commit comments