From 5e40d36aefe10502c3506e51072e32a3a492208e Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 3 Mar 2023 10:40:17 -0800 Subject: [PATCH] x64: Add `shuffle` lowerings for `shufps` This commit adds targeted lowerings for the `shuffle` instruction that match the pattern that `shufps` supports. The `shufps` instruction selects two elements from the first vector and two elements from the second vector which means while it's not generally applicable it should still be more useful than the catch-all lowering of `shuffle`. --- cranelift/codegen/src/isa/x64/lower.isle | 41 ++++++--- cranelift/codegen/src/isa/x64/lower/isle.rs | 29 +++++++ .../filetests/filetests/isa/x64/shuffle.clif | 84 ++++++++++++++----- .../filetests/runtests/simd-shuffle.clif | 30 +++++++ 4 files changed, 152 insertions(+), 32 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 391003281d12..1f6250499857 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3534,9 +3534,9 @@ ;; selects 32-bit values from either `x` or `y`, but not both. This means ;; there's one rule for selecting from `x` and another rule for selecting from ;; `y`. -(rule 6 (lower (shuffle x y (pshufd_lhs_imm imm))) +(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm))) (x64_pshufd x imm)) -(rule 5 (lower (shuffle x y (pshufd_rhs_imm imm))) +(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm))) (x64_pshufd y imm)) (decl pshufd_lhs_imm (u8) Immediate) @@ -3545,29 +3545,50 @@ (extern extractor pshufd_rhs_imm pshufd_rhs_imm) ;; Special case for i8-level interleaving of upper/low bytes. 
-(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) (x64_punpckhbw a b)) -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) (x64_punpcklbw a b)) ;; Special case for i16-level interleaving of upper/low bytes. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) (x64_punpckhwd a b)) -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) (x64_punpcklwd a b)) ;; Special case for i32-level interleaving of upper/low bytes. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) (x64_punpckhdq a b)) -(rule 4 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) (x64_punpckldq a b)) ;; Special case for i64-level interleaving of upper/low bytes. -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) (x64_punpckhqdq a b)) -(rule 4 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) +(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) (x64_punpcklqdq a b)) +;; Special case for the `shufps` instruction which will select two 32-bit values +;; from the first operand and two 32-bit values from the second operand. 
Note +;; that there is a second case here as well for when the operands can be +;; swapped. +;; +;; Note that the priority of this instruction is currently lower than the above +;; special cases since `shufps` handles many of them and for now it's +;; hypothesized that the dedicated instructions are better than `shufps`. +;; Someone with more knowledge about x86 timings should perhaps reorder the +;; rules here eventually though. +(rule 5 (lower (shuffle x y (shufps_imm imm))) + (x64_shufps x y imm)) +(rule 4 (lower (shuffle x y (shufps_rev_imm imm))) + (x64_shufps y x imm)) + +(decl shufps_imm(u8) Immediate) +(extern extractor shufps_imm shufps_imm) +(decl shufps_rev_imm(u8) Immediate) +(extern extractor shufps_rev_imm shufps_rev_imm) + + ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 5101e06edf42..b6d5364f2e90 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -1024,6 +1024,35 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { None } } + + fn shufps_imm(&mut self, imm: Immediate) -> Option { + // The `shufps` instruction selects the first two elements from the + // first vector and the second two elements from the second vector, so + // offset the third/fourth selectors by 4 and then make sure everything + // fits in 32-bits + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let c = c.checked_sub(4)?; + let d = d.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } + + fn shufps_rev_imm(&mut self, imm: Immediate) -> Option { + // This is almost the same as `shufps_imm` except the elements that are + // subtracted are 
reversed. This handles the case that `shufps` + // instruction can be emitted if the order of the operands are swapped. + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + let a = a.checked_sub(4)?; + let b = b.checked_sub(4)?; + if a < 4 && b < 4 && c < 4 && d < 4 { + Some(a | (b << 2) | (c << 4) | (d << 6)) + } else { + None + } + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index b0e4827e2c95..9d919823e9bd 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -205,10 +205,7 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pshufb %xmm0, const(0), %xmm0 -; movdqa %xmm1, %xmm5 -; pshufb %xmm5, const(1), %xmm5 -; por %xmm0, %xmm5, %xmm0 +; shufps $78, %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -218,27 +215,10 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pshufb 0x23(%rip), %xmm0 -; movdqa %xmm1, %xmm5 -; pshufb 0x26(%rip), %xmm5 -; por %xmm5, %xmm0 +; shufps $0x4e, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %cl, (%rax) -; orl %ecx, (%rdx) -; orl -0x7f7ff0f2(, %rcx), %ecx -; addb $0x80, -0x7f7f7f80(%rax) -; addb $0x80, -0x7f7f7f80(%rax) -; addb %al, (%rcx) -; addb (%rbx), %al -; addb $5, %al function %punpckldq(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): @@ -352,3 +332,63 @@ block0(v0: i64x2, v1: i64x2): ; popq %rbp ; retq +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shufps $251, 
%xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shufps $0xfb, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqa %xmm0, %xmm4 +; movdqa %xmm1, %xmm0 +; shufps $6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index ec1a5f851239..09243c9e8955 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -140,3 +140,33 @@ block0(v0: i64x2, v1: i64x2): return v5 } ; run: %punpckhqdq([1 2], [5 6]) == [2 6] + +function %shufps_0145(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [1 2 5 6] + +function %shufps_3277(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 8 9 10 11 28 29 30 31 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %shufps_0145([1 2 3 4], [5 6 7 8]) == [4 3 8 8] + +function %shufps_6500(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = 
bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [24 25 26 27 20 21 22 23 0 1 2 3 0 1 2 3]
+    v5 = bitcast.i32x4 little v4
+    return v5
+}
+; run: %shufps_6500([1 2 3 4], [5 6 7 8]) == [7 6 1 1]