x64: Add non-SSE4.1 lowerings of pmov{s,z}x* (#6279)
* x64: Add non-SSE4.1 lowerings of `pmov{s,z}x*`

This commit adds lowerings for a suite of sign/zero-extension
instructions which don't require SSE4.1. As before, these lowerings are
based on LLVM's output.

This commit also deletes the special cases for `i16x8.extmul_{low,high}_*`
since the output of each special case is the same as the default lowering
of the component instructions it was built from.

* Remove SSE4.1 specialization of `uwiden_high`

LLVM prefers the `punpckh*`-based lowerings, and at least according to
`llvm-mca` they are slightly better cycle-wise too.
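
For illustration, here is a scalar model (plain Rust, not compiler code; the
function names are made up) of why a `punpckh*`-with-zero lowering yields the
same lanes as the `pmovzx*`-based one for `uwiden_high` of `i8x16`:

    fn pmovzxbw_high_model(x: [u8; 16]) -> [u16; 8] {
        // Move the high half down (e.g. via `palignr`), then zero-extend each byte.
        core::array::from_fn(|i| x[i + 8] as u16)
    }

    fn punpckhbw_zero_model(x: [u8; 16]) -> [u16; 8] {
        // `punpckhbw x, zero`: interleave the high 8 bytes of `x` with zero bytes;
        // each little-endian byte pair (x[i + 8], 0) reads back as x[i + 8].
        core::array::from_fn(|i| u16::from_le_bytes([x[i + 8], 0]))
    }

    fn main() {
        let x: [u8; 16] = core::array::from_fn(|i| (i as u8).wrapping_mul(31));
        assert_eq!(pmovzxbw_high_model(x), punpckhbw_zero_model(x));
    }
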
alexcrichton authored Apr 27, 2023
1 parent 57dabd3 commit edae6c0
Showing 13 changed files with 209 additions and 159 deletions.
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -908,6 +908,7 @@
Ucomiss
Ucomisd
Unpcklps
Unpckhps
Xorps
Xorpd
Phaddw
@@ -1183,6 +1184,7 @@
Vpunpckhwd
Vpunpcklwd
Vunpcklps
Vunpckhps
Vandnps
Vandnpd
Vpandn
@@ -2901,6 +2903,14 @@
(if-let $true (use_avx_simd))
(xmm_rmir_vex (AvxOpcode.Vunpcklps) src1 src2))

;; Helper for creating `unpckhps` instructions.
(decl x64_unpckhps (Xmm XmmMem) Xmm)
(rule 0 (x64_unpckhps src1 src2)
(xmm_rm_r (SseOpcode.Unpckhps) src1 src2))
(rule 1 (x64_unpckhps src1 src2)
(if-let $true (use_avx_simd))
(xmm_rmir_vex (AvxOpcode.Vunpckhps) src1 src2))

;; Helper for creating `andnps` instructions.
(decl x64_andnps (Xmm XmmMem) Xmm)
(rule 0 (x64_andnps src1 src2)
@@ -4908,6 +4918,7 @@
(convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned)
(convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm)
(convert XmmMem RegMem xmm_mem_to_reg_mem)
(convert RegMemImm XmmMemImm xmm_mem_imm_new)
(convert WritableXmm Xmm writable_xmm_to_xmm)
(convert WritableXmm WritableReg writable_xmm_to_reg)
(convert WritableXmm Reg writable_xmm_to_r_reg)
4 changes: 4 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1116,6 +1116,7 @@ pub enum SseOpcode {
Ucomiss,
Ucomisd,
Unpcklps,
Unpckhps,
Xorps,
Xorpd,
Phaddw,
@@ -1168,6 +1169,7 @@ impl SseOpcode {
| SseOpcode::Subss
| SseOpcode::Ucomiss
| SseOpcode::Unpcklps
| SseOpcode::Unpckhps
| SseOpcode::Xorps => SSE,

SseOpcode::Addpd
@@ -1516,6 +1518,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Ucomiss => "ucomiss",
SseOpcode::Ucomisd => "ucomisd",
SseOpcode::Unpcklps => "unpcklps",
SseOpcode::Unpckhps => "unpckhps",
SseOpcode::Xorps => "xorps",
SseOpcode::Xorpd => "xorpd",
SseOpcode::Phaddw => "phaddw",
@@ -1611,6 +1614,7 @@ impl AvxOpcode {
| AvxOpcode::Vpunpckhwd
| AvxOpcode::Vpunpcklwd
| AvxOpcode::Vunpcklps
| AvxOpcode::Vunpckhps
| AvxOpcode::Vaddps
| AvxOpcode::Vaddpd
| AvxOpcode::Vsubps
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2060,6 +2060,7 @@ pub(crate) fn emit(
SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
SseOpcode::Unpckhps => (LegacyPrefixes::None, 0x0F15, 2),
SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
@@ -2206,6 +2207,7 @@
AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
AvxOpcode::Vunpckhps => (LP::None, OM::_0F, 0x15),
AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),
179 changes: 100 additions & 79 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -982,20 +982,6 @@
;; al_bl + aa_bb_shifted
(x64_paddq al_bl aa_bb_shifted)))

;; Special case for `i16x8.extmul_high_i8x16_s`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (swiden_high (and (value_type (multi_lane 8 16))
x))
(swiden_high (and (value_type (multi_lane 8 16))
y)))))
(let ((x1 Xmm x)
(x2 Xmm (x64_palignr x1 x1 8))
(x3 Xmm (x64_pmovsxbw x2))
(y1 Xmm y)
(y2 Xmm (x64_palignr y1 y1 8))
(y3 Xmm (x64_pmovsxbw y2)))
(x64_pmullw x3 y3)))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (swiden_high (and (value_type (multi_lane 16 8))
@@ -1019,16 +1005,6 @@
(y2 Xmm (x64_pshufd y 0xFA)))
(x64_pmuldq x2 y2)))

;; Special case for `i16x8.extmul_low_i8x16_s`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (swiden_low (and (value_type (multi_lane 8 16))
x))
(swiden_low (and (value_type (multi_lane 8 16))
y)))))
(let ((x2 Xmm (x64_pmovsxbw x))
(y2 Xmm (x64_pmovsxbw y)))
(x64_pmullw x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (swiden_low (and (value_type (multi_lane 16 8))
@@ -1052,20 +1028,6 @@
(y2 Xmm (x64_pshufd y 0x50)))
(x64_pmuldq x2 y2)))

;; Special case for `i16x8.extmul_high_i8x16_u`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (uwiden_high (and (value_type (multi_lane 8 16))
x))
(uwiden_high (and (value_type (multi_lane 8 16))
y)))))
(let ((x1 Xmm x)
(x2 Xmm (x64_palignr x1 x1 8))
(x3 Xmm (x64_pmovzxbw x2))
(y1 Xmm y)
(y2 Xmm (x64_palignr y1 y1 8))
(y3 Xmm (x64_pmovzxbw y2)))
(x64_pmullw x3 y3)))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (uwiden_high (and (value_type (multi_lane 16 8))
@@ -1088,16 +1050,6 @@
(y2 Xmm (x64_pshufd y 0xFA)))
(x64_pmuludq x2 y2)))

;; Special case for `i16x8.extmul_low_i8x16_u`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (uwiden_low (and (value_type (multi_lane 8 16))
x))
(uwiden_low (and (value_type (multi_lane 8 16))
y)))))
(let ((x2 Xmm (x64_pmovzxbw x))
(y2 Xmm (x64_pmovzxbw y)))
(x64_pmullw x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (uwiden_low (and (value_type (multi_lane 16 8))
@@ -2559,18 +2511,37 @@

;; We also include widening vector loads; these sign- or zero-extend each lane
;; to the next wider width (e.g., 16x4 -> 32x4).
(rule 1 (lower (has_type $I16X8 (sload8x8 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I16X8 (uload8x8 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (sload16x4 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (uload16x4 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (sload32x2 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxdq (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (uload32x2 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxdq (to_amode flags address offset)))

(rule (lower (has_type $I16X8 (sload8x8 flags address offset)))
(x64_pmovsxbw (to_amode flags address offset)))
(lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I16X8 (uload8x8 flags address offset)))
(x64_pmovzxbw (to_amode flags address offset)))
(lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (sload16x4 flags address offset)))
(x64_pmovsxwd (to_amode flags address offset)))
(lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (uload16x4 flags address offset)))
(x64_pmovzxwd (to_amode flags address offset)))
(lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (sload32x2 flags address offset)))
(x64_pmovsxdq (to_amode flags address offset)))
(lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (uload32x2 flags address offset)))
(x64_pmovzxdq (to_amode flags address offset)))
(lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))

;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -3266,51 +3237,101 @@

;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
(x64_pmovsxbw val))
;; With SSE4.1 use the `pmovsx*` instructions for this
(rule 1 (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(x64_pmovsxbw val))
(rule 1 (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(x64_pmovsxwd val))
(rule 1 (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovsxdq val))

(rule (lower (has_type ty (swiden_low val))) (lower_swiden_low ty val))

(rule (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8))))
(x64_pmovsxwd val))
(decl lower_swiden_low (Type Xmm) Xmm)

(rule (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4))))
(x64_pmovsxdq val))
;; Duplicate the low lanes next to each other, then perform a wider shift-right
;; by the low lane width to move the upper of each pair back into the lower lane
;; of each pair, achieving the widening of the lower lanes.
(rule (lower_swiden_low $I16X8 val)
(x64_psraw (x64_punpcklbw val val) (xmi_imm 8)))
(rule (lower_swiden_low $I32X4 val)
(x64_psrad (x64_punpcklwd val val) (xmi_imm 16)))

;; Generate the sign-extended halves with a `val < 0` comparison (expressed
;; reversed here), then interleave the low 32-bit halves to create the full
;; 64-bit results.
(rule (lower_swiden_low $I64X2 val)
(let ((tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
(x64_punpckldq val tmp)))
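
The I16X8 and I32X4 rules above compute a sign extension without any SSE4.1
instruction. A scalar sketch of the trick (illustrative Rust, not part of the
compiled rules; little-endian lane layout assumed):

    fn swiden_low_i8_model(x: [i8; 16]) -> [i16; 8] {
        core::array::from_fn(|i| {
            // punpcklbw x, x: lane i now holds the byte pair (x[i], x[i]).
            let lane = i16::from_le_bytes([x[i] as u8, x[i] as u8]);
            // psraw $8: the arithmetic shift leaves x[i] sign-extended to 16 bits.
            lane >> 8
        })
    }

    fn main() {
        let x: [i8; 16] = core::array::from_fn(|i| (i as i8).wrapping_sub(8));
        let expected: [i16; 8] = core::array::from_fn(|i| x[i] as i16);
        assert_eq!(swiden_low_i8_model(x), expected);
    }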

;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
(let ((x Xmm val))
(x64_pmovsxbw (x64_palignr x x 8))))
;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved
;; to the lower lanes first.
(rule 1 (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(let ((x Xmm val))
(x64_pmovsxbw (x64_palignr x x 8))))
(rule 1 (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(let ((x Xmm val))
(x64_pmovsxwd (x64_palignr x x 8))))
(rule 1 (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovsxdq (x64_pshufd val 0b11_10_11_10)))

;; Similar to `swiden_low` versions but using `punpckh*` instructions to
;; pair the high lanes next to each other.
(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
(let ((val Xmm val))
(x64_psraw (x64_punpckhbw val val) (xmi_imm 8))))
(rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
(let ((x Xmm val))
(x64_pmovsxwd (x64_palignr x x 8))))
(let ((val Xmm val))
(x64_psrad (x64_punpckhwd val val) (xmi_imm 16))))

;; Same as `swiden_low`, but `val` has its high lanes moved down.
(rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
(x64_pmovsxdq (x64_pshufd val 0xEE)))
(let ((val Xmm (x64_pshufd val 0b00_00_11_10))
(tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
(x64_punpckldq val tmp)))
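
For the I64X2 cases the sign bits come from a comparison rather than a shift.
A scalar sketch (illustrative Rust, little-endian lane layout assumed) of the
`pcmpgtd`/`punpckldq` sequence; the `swiden_high` rule above is the same after
`pshufd` moves the high lanes down:

    fn swiden_i32_to_i64_model(x: [i32; 4]) -> [i64; 2] {
        core::array::from_fn(|i| {
            // pcmpgtd zero, x: all-ones exactly where the lane is negative.
            let mask: u32 = if 0 > x[i] { u32::MAX } else { 0 };
            // punpckldq x, mask: the lane is the low half, the mask the high half.
            (((mask as u64) << 32) | (x[i] as u32 as u64)) as i64
        })
    }

    fn main() {
        let x = [-1, 2, i32::MIN, i32::MAX];
        assert_eq!(swiden_i32_to_i64_model(x), [-1i64, 2]);
    }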

;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16))))
(x64_pmovzxbw val))
;; With SSE4.1 use the `pmovzx*` instructions for this
(rule 1 (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(x64_pmovzxbw val))
(rule 1 (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(x64_pmovzxwd val))
(rule 1 (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovzxdq val))

(rule (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8))))
(x64_pmovzxwd val))
(rule (lower (has_type ty (uwiden_low val))) (lower_uwiden_low ty val))

(rule (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4))))
(x64_pmovzxdq val))
;; Interleave an all-zero register with the low lanes to produce zero-extended
;; results.
(decl lower_uwiden_low (Type Xmm) Xmm)
(rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4)))

;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same as `uwiden_low`, but interleaving the high lanes instead.
;;
;; Note that according to `llvm-mca` at least these instructions are faster
;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available.
(rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
(let ((x Xmm val))
(x64_pmovzxbw (x64_palignr x x 8))))

(x64_punpckhbw val (xmm_zero $I8X16)))
(rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
(let ((x Xmm val))
(x64_pmovzxwd (x64_palignr x x 8))))

(x64_punpckhwd val (xmm_zero $I8X16)))
(rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
(x64_pmovzxdq (x64_pshufd val 0xEE)))
(x64_unpckhps val (xmm_zero $F32X4)))

;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

9 changes: 5 additions & 4 deletions cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
@@ -1204,8 +1204,9 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; uninit %xmm2
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -1215,8 +1216,8 @@
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq