From 157e9e8ac75b11036981d5e873dfd7321e340473 Mon Sep 17 00:00:00 2001 From: Alex Crichton <alex@alexcrichton.com> Date: Fri, 3 Mar 2023 19:24:52 -0800 Subject: [PATCH] x64: Specialize `shuffle` with an all-zeros immediate Instead of loading the all-zeros immediate from a rip-relative address at the end of the function, generate a zero with a `pxor` instruction and then use `pshufb` to do the broadcast. --- cranelift/codegen/src/isa/x64/lower.isle | 8 ++++++ .../filetests/filetests/isa/x64/shuffle.clif | 27 +++++++++++++++++++ .../filetests/runtests/simd-shuffle.clif | 7 +++++ 3 files changed, 42 insertions(+) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 4173702a70c8..651f315aafe9 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3571,6 +3571,14 @@ (rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) (x64_punpcklqdq a b)) +;; If the vector shuffle mask is all 0s then that means the first byte of the +;; first operand is broadcast to all bytes. Falling through would load an +;; all-zeros constant from a rip-relative location but it should be slightly +;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero +;; register. +(rule 6 (lower (shuffle a _ (u128_from_immediate 0))) + (x64_pshufb a (xmm_zero $I8X16))) + ;; Special case for the `shufps` instruction which will select two 32-bit values ;; from the first operand and two 32-bit values from the second operand. 
Note ;; that there is a second case here as well for when the operands can be diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index 6de850a16de0..b056d9f1686c 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -616,3 +616,30 @@ block0(v0: i16x8, v1: i16x8): ; popq %rbp ; retq +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm3, %xmm3, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pxor %xmm3, %xmm3 +; pshufb %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 184bce5c4e7f..b9913f443d4b 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -244,3 +244,10 @@ block0(v0: i16x8, v1: i16x8): return v5 } ; run: %pshufhw_rhs_3131([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 10 11 12 16 14 16 14] + +function %shuffle_all_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} +; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5]