From 157e9e8ac75b11036981d5e873dfd7321e340473 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Fri, 3 Mar 2023 19:24:52 -0800
Subject: [PATCH] x64: Specialize `shuffle` with an all-zeros immediate

Instead of loading the all-zeros immediate from a rip-relative address
at the end of the function, generate a zero with a `pxor` instruction
and then use `pshufb` to do the broadcast.
---
 cranelift/codegen/src/isa/x64/lower.isle      |  8 ++++++
 .../filetests/filetests/isa/x64/shuffle.clif  | 27 +++++++++++++++++++
 .../filetests/runtests/simd-shuffle.clif      |  7 +++++
 3 files changed, 42 insertions(+)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 4173702a70c8..651f315aafe9 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3571,6 +3571,14 @@
 (rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
       (x64_punpcklqdq a b))
 
+;; If the shuffle mask is all 0s then the first byte of the first operand is
+;; broadcast to all bytes and the second operand is unused. Falling through
+;; would load an all-zeros constant from a rip-relative location but it should
+;; be slightly more efficient to execute the `pshufb` here-and-now with an
+;; xor'd-to-be-zero register.
+(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
+      (x64_pshufb a (xmm_zero $I8X16)))
+
 ;; Special case for the `shufps` instruction which will select two 32-bit values
 ;; from the first operand and two 32-bit values from the second operand. Note
 ;; that there is a second case here as well for when the operands can be
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif
index 6de850a16de0..b056d9f1686c 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif
@@ -616,3 +616,30 @@ block0(v0: i16x8, v1: i16x8):
 ;   popq %rbp
 ;   retq
 
+function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+    return v2
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pxor    %xmm3, %xmm3, %xmm3
+;   pshufb  %xmm0, %xmm3, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   pxor %xmm3, %xmm3
+;   pshufb %xmm3, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
index 184bce5c4e7f..b9913f443d4b 100644
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -244,3 +244,10 @@ block0(v0: i16x8, v1: i16x8):
     return v5
 }
 ; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14]
+
+function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+    return v2
+}
+; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]