diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index b816b163a0a3..075bda1c0868 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -908,6 +908,7 @@ Ucomiss Ucomisd Unpcklps + Unpckhps Xorps Xorpd Phaddw @@ -1183,6 +1184,7 @@ Vpunpckhwd Vpunpcklwd Vunpcklps + Vunpckhps Vandnps Vandnpd Vpandn @@ -2901,6 +2903,14 @@ (if-let $true (use_avx_simd)) (xmm_rmir_vex (AvxOpcode.Vunpcklps) src1 src2)) +;; Helper for creating `unpckhps` instructions. +(decl x64_unpckhps (Xmm XmmMem) Xmm) +(rule 0 (x64_unpckhps src1 src2) + (xmm_rm_r (SseOpcode.Unpckhps) src1 src2)) +(rule 1 (x64_unpckhps src1 src2) + (if-let $true (use_avx_simd)) + (xmm_rmir_vex (AvxOpcode.Vunpckhps) src1 src2)) + ;; Helper for creating `andnps` instructions. (decl x64_andnps (Xmm XmmMem) Xmm) (rule 0 (x64_andnps src1 src2) @@ -4908,6 +4918,7 @@ (convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned) (convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm) (convert XmmMem RegMem xmm_mem_to_reg_mem) +(convert RegMemImm XmmMemImm xmm_mem_imm_new) (convert WritableXmm Xmm writable_xmm_to_xmm) (convert WritableXmm WritableReg writable_xmm_to_reg) (convert WritableXmm Reg writable_xmm_to_r_reg) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index b2301f729724..eb54103da1f8 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1116,6 +1116,7 @@ pub enum SseOpcode { Ucomiss, Ucomisd, Unpcklps, + Unpckhps, Xorps, Xorpd, Phaddw, @@ -1168,6 +1169,7 @@ impl SseOpcode { | SseOpcode::Subss | SseOpcode::Ucomiss | SseOpcode::Unpcklps + | SseOpcode::Unpckhps | SseOpcode::Xorps => SSE, SseOpcode::Addpd @@ -1516,6 +1518,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Ucomiss => "ucomiss", SseOpcode::Ucomisd => "ucomisd", SseOpcode::Unpcklps => "unpcklps", + SseOpcode::Unpckhps => "unpckhps", SseOpcode::Xorps => "xorps", SseOpcode::Xorpd => "xorpd", SseOpcode::Phaddw => "phaddw", @@ -1611,6 +1614,7 @@ impl AvxOpcode { | AvxOpcode::Vpunpckhwd | AvxOpcode::Vpunpcklwd | AvxOpcode::Vunpcklps + | AvxOpcode::Vunpckhps | AvxOpcode::Vaddps | AvxOpcode::Vaddpd | AvxOpcode::Vsubps diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 53c4b6808b41..a2a08c677f38 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2060,6 +2060,7 @@ pub(crate) fn emit( SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2), SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2), SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2), + SseOpcode::Unpckhps => (LegacyPrefixes::None, 0x0F15, 2), SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2), SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2), SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3), @@ -2206,6 +2207,7 @@ pub(crate) fn emit( AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69), AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61), AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14), + AvxOpcode::Vunpckhps => (LP::None, OM::_0F, 0x15), AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58), AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58), AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C), diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 7a2b76ceb917..ff6585ef874d 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -982,20 +982,6 @@ 
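For reference, a minimal scalar sketch (not from the Cranelift sources; the helper name is hypothetical) of what the newly added `unpckhps`/`vunpckhps` instruction computes: it interleaves the two high 32-bit lanes of its operands, which the lowering changes below exploit by pairing a value with an all-zero register.

```rust
// Scalar model of `unpckhps a, b`: interleave the two high 32-bit lanes of
// each operand (little-endian lane order; lane 0 is the lowest).
fn unpckhps(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
    [a[2], b[2], a[3], b[3]]
}

fn main() {
    // Interleaving with an all-zero register zero-extends the two high i32
    // lanes: [3, 0, 4, 0] reinterpreted as i64x2 is [3, 4].
    assert_eq!(unpckhps([1, 2, 3, 4], [0; 4]), [3, 0, 4, 0]);
}
```

The SSE form is encoded as `0F 15 /r` with no legacy prefix, matching the new entry in `emit.rs`.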
;; al_bl + aa_bb_shifted (x64_paddq al_bl aa_bb_shifted))) -;; Special case for `i16x8.extmul_high_i8x16_s`. -(rule 1 (lower (has_type (multi_lane 16 8) - (imul (swiden_high (and (value_type (multi_lane 8 16)) - x)) - (swiden_high (and (value_type (multi_lane 8 16)) - y))))) - (let ((x1 Xmm x) - (x2 Xmm (x64_palignr x1 x1 8)) - (x3 Xmm (x64_pmovsxbw x2)) - (y1 Xmm y) - (y2 Xmm (x64_palignr y1 y1 8)) - (y3 Xmm (x64_pmovsxbw y2))) - (x64_pmullw x3 y3))) - ;; Special case for `i32x4.extmul_high_i16x8_s`. (rule 1 (lower (has_type (multi_lane 32 4) (imul (swiden_high (and (value_type (multi_lane 16 8)) @@ -1019,16 +1005,6 @@ (y2 Xmm (x64_pshufd y 0xFA))) (x64_pmuldq x2 y2))) -;; Special case for `i16x8.extmul_low_i8x16_s`. -(rule 1 (lower (has_type (multi_lane 16 8) - (imul (swiden_low (and (value_type (multi_lane 8 16)) - x)) - (swiden_low (and (value_type (multi_lane 8 16)) - y))))) - (let ((x2 Xmm (x64_pmovsxbw x)) - (y2 Xmm (x64_pmovsxbw y))) - (x64_pmullw x2 y2))) - ;; Special case for `i32x4.extmul_low_i16x8_s`. (rule 1 (lower (has_type (multi_lane 32 4) (imul (swiden_low (and (value_type (multi_lane 16 8)) @@ -1052,20 +1028,6 @@ (y2 Xmm (x64_pshufd y 0x50))) (x64_pmuldq x2 y2))) -;; Special case for `i16x8.extmul_high_i8x16_u`. -(rule 1 (lower (has_type (multi_lane 16 8) - (imul (uwiden_high (and (value_type (multi_lane 8 16)) - x)) - (uwiden_high (and (value_type (multi_lane 8 16)) - y))))) - (let ((x1 Xmm x) - (x2 Xmm (x64_palignr x1 x1 8)) - (x3 Xmm (x64_pmovzxbw x2)) - (y1 Xmm y) - (y2 Xmm (x64_palignr y1 y1 8)) - (y3 Xmm (x64_pmovzxbw y2))) - (x64_pmullw x3 y3))) - ;; Special case for `i32x4.extmul_high_i16x8_u`. (rule 1 (lower (has_type (multi_lane 32 4) (imul (uwiden_high (and (value_type (multi_lane 16 8)) @@ -1088,16 +1050,6 @@ (y2 Xmm (x64_pshufd y 0xFA))) (x64_pmuludq x2 y2))) -;; Special case for `i16x8.extmul_low_i8x16_u`. -(rule 1 (lower (has_type (multi_lane 16 8) - (imul (uwiden_low (and (value_type (multi_lane 8 16)) - x)) - (uwiden_low (and (value_type (multi_lane 8 16)) - y))))) - (let ((x2 Xmm (x64_pmovzxbw x)) - (y2 Xmm (x64_pmovzxbw y))) - (x64_pmullw x2 y2))) - ;; Special case for `i32x4.extmul_low_i16x8_u`. (rule 1 (lower (has_type (multi_lane 32 4) (imul (uwiden_low (and (value_type (multi_lane 16 8)) @@ -2559,18 +2511,37 @@ ;; We also include widening vector loads; these sign- or zero-extend each lane ;; to the next wider width (e.g., 16x4 -> 32x4). 
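As a reference for the widening-load rules added below, here is a scalar sketch of the 8-byte case (hypothetical helper names; a sketch, not part of the backend or test suite):

```rust
// Scalar model of the 8-byte widening loads: read 8 bytes and widen each
// one to a 16-bit lane.
fn sload8x8(bytes: [u8; 8]) -> [i16; 8] {
    bytes.map(|b| i16::from(b as i8)) // sign-extend each byte
}

fn uload8x8(bytes: [u8; 8]) -> [u16; 8] {
    bytes.map(|b| u16::from(b)) // zero-extend each byte
}

fn main() {
    assert_eq!(sload8x8([0xff; 8])[0], -1);
    assert_eq!(uload8x8([0xff; 8])[0], 0x00ff);
}
```

With SSE4.1 the rules below use `pmovsx*`/`pmovzx*` directly on the memory operand; without it, the value is loaded with `movq` and then widened by the same `lower_swiden_low`/`lower_uwiden_low` helpers used for the in-register cases.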
+(rule 1 (lower (has_type $I16X8 (sload8x8 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovsxbw (to_amode flags address offset))) +(rule 1 (lower (has_type $I16X8 (uload8x8 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovzxbw (to_amode flags address offset))) +(rule 1 (lower (has_type $I32X4 (sload16x4 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovsxwd (to_amode flags address offset))) +(rule 1 (lower (has_type $I32X4 (uload16x4 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovzxwd (to_amode flags address offset))) +(rule 1 (lower (has_type $I64X2 (sload32x2 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovsxdq (to_amode flags address offset))) +(rule 1 (lower (has_type $I64X2 (uload32x2 flags address offset))) + (if-let $true (use_sse41)) + (x64_pmovzxdq (to_amode flags address offset))) + (rule (lower (has_type $I16X8 (sload8x8 flags address offset))) - (x64_pmovsxbw (to_amode flags address offset))) + (lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) (rule (lower (has_type $I16X8 (uload8x8 flags address offset))) - (x64_pmovzxbw (to_amode flags address offset))) + (lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) (rule (lower (has_type $I32X4 (sload16x4 flags address offset))) - (x64_pmovsxwd (to_amode flags address offset))) + (lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) (rule (lower (has_type $I32X4 (uload16x4 flags address offset))) - (x64_pmovzxwd (to_amode flags address offset))) + (lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) (rule (lower (has_type $I64X2 (sload32x2 flags address offset))) - (x64_pmovsxdq (to_amode flags address offset))) + (lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) (rule (lower (has_type $I64X2 (uload32x2 flags address offset))) - (x64_pmovzxdq (to_amode flags address offset))) + (lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) ;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3266,51 +3237,101 @@ ;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16)))) - (x64_pmovsxbw val)) +;; With SSE4.1 use the `pmovsx*` instructions for this +(rule 1 (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16)))) + (if-let $true (use_sse41)) + (x64_pmovsxbw val)) +(rule 1 (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8)))) + (if-let $true (use_sse41)) + (x64_pmovsxwd val)) +(rule 1 (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4)))) + (if-let $true (use_sse41)) + (x64_pmovsxdq val)) + +(rule (lower (has_type ty (swiden_low val))) (lower_swiden_low ty val)) -(rule (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8)))) - (x64_pmovsxwd val)) +(decl lower_swiden_low (Type Xmm) Xmm) -(rule (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4)))) - (x64_pmovsxdq val)) +;; Duplicate the low lanes next to each other, then perform a wider shift-right +;; by the low lane width to move the upper of each pair back into the lower lane +;; of each pair, achieving the widening of the lower lanes. 
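A scalar sketch of that trick for the i8x16-to-i16x8 case handled by the first `lower_swiden_low` rule below (hypothetical helper; little-endian lane order assumed):

```rust
// punpcklbw val, val pairs each low byte with itself to form a 16-bit lane;
// psraw $8 then shifts the byte back into the low half arithmetically,
// replicating its sign bit through the upper half.
fn swiden_low_i8x16(v: [i8; 16]) -> [i16; 8] {
    let mut out = [0i16; 8];
    for i in 0..8 {
        let b = u16::from(v[i] as u8);
        let paired = (b << 8) | b;     // punpcklbw val, val
        out[i] = (paired as i16) >> 8; // psraw $8 (arithmetic shift)
    }
    out
}

fn main() {
    let mut v = [0i8; 16];
    v[0] = -1;
    v[1] = 2;
    v[2] = -3;
    let w = swiden_low_i8x16(v);
    assert_eq!(w[..3], [-1i16, 2, -3]);
}
```

The `$I32X4` rule is the same trick at 16-bit granularity (`punpcklwd` + `psrad $16`); `$I64X2` has no 64-bit arithmetic shift, so it instead builds the high halves with the reversed `0 > val` comparison described further down.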
+(rule (lower_swiden_low $I16X8 val) + (x64_psraw (x64_punpcklbw val val) (xmi_imm 8))) +(rule (lower_swiden_low $I32X4 val) + (x64_psrad (x64_punpcklwd val val) (xmi_imm 16))) + +;; Generate the sign-extended halves with a `val < 0` comparison (expressed +;; reversed here), then interleave the low 32-bit halves to create the full +;; 64-bit results. +(rule (lower_swiden_low $I64X2 val) + (let ((tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val))) + (x64_punpckldq val tmp))) ;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16)))) - (let ((x Xmm val)) - (x64_pmovsxbw (x64_palignr x x 8)))) +;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved +;; to the lower lanes first. +(rule 1 (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16)))) + (if-let $true (use_sse41)) + (let ((x Xmm val)) + (x64_pmovsxbw (x64_palignr x x 8)))) +(rule 1 (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8)))) + (if-let $true (use_sse41)) + (let ((x Xmm val)) + (x64_pmovsxwd (x64_palignr x x 8)))) +(rule 1 (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4)))) + (if-let $true (use_sse41)) + (x64_pmovsxdq (x64_pshufd val 0b11_10_11_10))) +;; Similar to `swiden_low` versions but using `punpckh*` instructions to +;; pair the high lanes next to each other. +(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16)))) + (let ((val Xmm val)) + (x64_psraw (x64_punpckhbw val val) (xmi_imm 8)))) (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8)))) - (let ((x Xmm val)) - (x64_pmovsxwd (x64_palignr x x 8)))) + (let ((val Xmm val)) + (x64_psrad (x64_punpckhwd val val) (xmi_imm 16)))) +;; Same as `swiden_low`, but `val` has its high lanes moved down. (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4)))) - (x64_pmovsxdq (x64_pshufd val 0xEE))) + (let ((val Xmm (x64_pshufd val 0b00_00_11_10)) + (tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val))) + (x64_punpckldq val tmp))) ;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16)))) - (x64_pmovzxbw val)) +;; With SSE4.1 use the `pmovzx*` instructions for this +(rule 1 (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16)))) + (if-let $true (use_sse41)) + (x64_pmovzxbw val)) +(rule 1 (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8)))) + (if-let $true (use_sse41)) + (x64_pmovzxwd val)) +(rule 1 (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4)))) + (if-let $true (use_sse41)) + (x64_pmovzxdq val)) -(rule (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8)))) - (x64_pmovzxwd val)) +(rule (lower (has_type ty (uwiden_low val))) (lower_uwiden_low ty val)) -(rule (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4)))) - (x64_pmovzxdq val)) +;; Interleave an all-zero register with the low lanes to produce zero-extended +;; results. +(decl lower_uwiden_low (Type Xmm) Xmm) +(rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16))) +(rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16))) +(rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4))) ;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Same as `uwiden_high`, but interleaving high lanes instead. 
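A scalar sketch of that interleave for the i8x16-to-i16x8 case (hypothetical helper; little-endian lane order assumed): pairing each high byte with a byte from an all-zero register is exactly a zero-extension of the high eight lanes.

```rust
// punpckhbw val, zero: 16-bit lane i gets byte 8+i of `val` in its low half
// and byte 8+i of the zeroed register in its high half.
fn uwiden_high_i8x16(v: [u8; 16]) -> [u16; 8] {
    let zero = [0u8; 16];
    let mut out = [0u16; 8];
    for i in 0..8 {
        out[i] = u16::from(v[8 + i]) | (u16::from(zero[8 + i]) << 8);
    }
    out
}

fn main() {
    let mut v = [0u8; 16];
    v[8] = 0x80;
    v[15] = 0xff;
    let w = uwiden_high_i8x16(v);
    assert_eq!((w[0], w[7]), (0x0080u16, 0x00ff)); // zero-, not sign-extended
}
```

The `$I32X4` and `$I64X2` cases below do the same with `punpckhwd` and `unpckhps` against a zeroed register.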
+;; +;; Note that according to `llvm-mca` at least these instructions are faster +;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available. (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16)))) - (let ((x Xmm val)) - (x64_pmovzxbw (x64_palignr x x 8)))) - + (x64_punpckhbw val (xmm_zero $I8X16))) (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8)))) - (let ((x Xmm val)) - (x64_pmovzxwd (x64_palignr x x 8)))) - + (x64_punpckhwd val (xmm_zero $I8X16))) (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4)))) - (x64_pmovzxdq (x64_pshufd val 0xEE))) + (x64_unpckhps val (xmm_zero $F32X4))) ;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index 40be5e1738bf..8a0381e97141 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -1204,8 +1204,9 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpalignr $8, %xmm0, %xmm0, %xmm2 -; vpmovzxbw %xmm2, %xmm0 +; uninit %xmm2 +; vpxor %xmm2, %xmm2, %xmm4 +; vpunpckhbw %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1215,8 +1216,8 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vpalignr $8, %xmm0, %xmm0, %xmm2 -; vpmovzxbw %xmm2, %xmm0 +; vpxor %xmm2, %xmm2, %xmm4 +; vpunpckhbw %xmm4, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif index e31b6f133f34..a77753e08b6e 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif @@ -16,13 +16,13 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm3 -; palignr $8, %xmm3, %xmm0, %xmm3 -; pmovsxbw %xmm3, %xmm0 -; movdqa %xmm1, %xmm7 -; palignr $8, %xmm7, %xmm1, %xmm7 -; pmovsxbw %xmm7, %xmm9 -; pmullw %xmm0, %xmm9, %xmm0 +; movdqa %xmm0, %xmm6 +; palignr $8, %xmm6, %xmm0, %xmm6 +; pmovsxbw %xmm6, %xmm0 +; movdqa %xmm1, %xmm6 +; palignr $8, %xmm6, %xmm1, %xmm6 +; pmovsxbw %xmm6, %xmm8 +; pmullw %xmm0, %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -32,13 +32,13 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm3 -; palignr $8, %xmm0, %xmm3 -; pmovsxbw %xmm3, %xmm0 -; movdqa %xmm1, %xmm7 -; palignr $8, %xmm1, %xmm7 -; pmovsxbw %xmm7, %xmm9 -; pmullw %xmm9, %xmm0 +; movdqa %xmm0, %xmm6 +; palignr $8, %xmm0, %xmm6 +; pmovsxbw %xmm6, %xmm0 +; movdqa %xmm1, %xmm6 +; palignr $8, %xmm1, %xmm6 +; pmovsxbw %xmm6, %xmm8 +; pmullw %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -226,13 +226,14 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm3 -; palignr $8, %xmm3, %xmm0, %xmm3 -; pmovzxbw %xmm3, %xmm0 -; movdqa %xmm1, %xmm7 -; palignr $8, %xmm7, %xmm1, %xmm7 -; pmovzxbw %xmm7, %xmm9 -; pmullw %xmm0, %xmm9, %xmm0 +; uninit %xmm8 +; pxor %xmm8, %xmm8, %xmm8 +; punpckhbw %xmm0, %xmm8, %xmm0 +; uninit %xmm8 +; pxor %xmm8, %xmm8, %xmm8 +; movdqa %xmm1, %xmm11 +; punpckhbw %xmm11, %xmm8, %xmm11 +; pmullw %xmm0, %xmm11, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -242,13 +243,12 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm3 -; palignr $8, %xmm0, %xmm3 
-; pmovzxbw %xmm3, %xmm0 -; movdqa %xmm1, %xmm7 -; palignr $8, %xmm1, %xmm7 -; pmovzxbw %xmm7, %xmm9 -; pmullw %xmm9, %xmm0 +; pxor %xmm8, %xmm8 +; punpckhbw %xmm8, %xmm0 +; pxor %xmm8, %xmm8 +; movdqa %xmm1, %xmm11 +; punpckhbw %xmm8, %xmm11 +; pmullw %xmm11, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/widen-high-bug.clif b/cranelift/filetests/filetests/isa/x64/widen-high-bug.clif index 9aa77be46c4b..a2b429e55d6c 100644 --- a/cranelift/filetests/filetests/isa/x64/widen-high-bug.clif +++ b/cranelift/filetests/filetests/isa/x64/widen-high-bug.clif @@ -12,9 +12,10 @@ block0(v0: i64, v2: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu 80(%rdi), %xmm3 -; palignr $8, %xmm3, %xmm3, %xmm3 -; pmovzxbw %xmm3, %xmm0 +; movdqu 80(%rdi), %xmm0 +; uninit %xmm5 +; pxor %xmm5, %xmm5, %xmm5 +; punpckhbw %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -24,9 +25,9 @@ block0(v0: i64, v2: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x50(%rdi), %xmm3 -; palignr $8, %xmm3, %xmm3 -; pmovzxbw %xmm3, %xmm0 +; movdqu 0x50(%rdi), %xmm0 +; pxor %xmm5, %xmm5 +; punpckhbw %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/widening.clif b/cranelift/filetests/filetests/isa/x64/widening.clif index 44377ad04139..2a79f92ee4f5 100644 --- a/cranelift/filetests/filetests/isa/x64/widening.clif +++ b/cranelift/filetests/filetests/isa/x64/widening.clif @@ -246,9 +246,9 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm2 -; palignr $8, %xmm2, %xmm0, %xmm2 -; pmovzxbw %xmm2, %xmm0 +; uninit %xmm3 +; pxor %xmm3, %xmm3, %xmm3 +; punpckhbw %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -258,9 +258,8 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm2 -; palignr $8, %xmm0, %xmm2 -; pmovzxbw %xmm2, %xmm0 +; pxor %xmm3, %xmm3 +; punpckhbw %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -275,9 +274,9 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm2 -; palignr $8, %xmm2, %xmm0, %xmm2 -; pmovzxwd %xmm2, %xmm0 +; uninit %xmm3 +; pxor %xmm3, %xmm3, %xmm3 +; punpckhwd %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -287,9 +286,8 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm2 -; palignr $8, %xmm0, %xmm2 -; pmovzxwd %xmm2, %xmm0 +; pxor %xmm3, %xmm3 +; punpckhwd %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -304,8 +302,9 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pshufd $238, %xmm0, %xmm2 -; pmovzxdq %xmm2, %xmm0 +; uninit %xmm3 +; xorps %xmm3, %xmm3, %xmm3 +; unpckhps %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -315,8 +314,8 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pshufd $0xee, %xmm0, %xmm2 -; pmovzxdq %xmm2, %xmm0 +; xorps %xmm3, %xmm3 +; unpckhps %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif index b75477f79d51..99a7ea5ad338 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif @@ -2,9 +2,11 @@ test interpret test run target aarch64 target s390x +target x86_64 ssse3 has_sse41=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target 
x86_64 +target x86_64 sse41 +target x86_64 sse41 has_avx function %swidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif index 406b15e41f83..6391a2471c13 100644 --- a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif @@ -2,9 +2,11 @@ test interpret test run target aarch64 target s390x +target x86_64 ssse3 has_sse41=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target x86_64 +target x86_64 sse41 +target x86_64 sse41 has_avx function %swidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif index 8a8949d24198..38084fac7b32 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif @@ -2,9 +2,11 @@ test interpret test run target aarch64 target s390x +target x86_64 ssse3 has_sse41=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target x86_64 +target x86_64 sse41 +target x86_64 sse41 has_avx function %uwidenhigh_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): @@ -12,6 +14,7 @@ block0(v0: i8x16): return v1 } ; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [9 10 11 12 13 14 15 16] +; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 0x80 10 0xff 11 0x92 12 0x70]) == [9 0x80 10 0xff 11 0x92 12 0x70] function %uwidenhigh_i16x8(i16x8) -> i32x4 { block0(v0: i16x8): @@ -19,6 +22,7 @@ block0(v0: i16x8): return v1 } ; run: %uwidenhigh_i16x8([1 2 3 4 5 6 7 8]) == [5 6 7 8] +; run: %uwidenhigh_i16x8([9 10 11 12 13 14 -1 -2]) == [13 14 0xffff 0xfffe] function %uwidenhigh_i32x4(i32x4) -> i64x2 { block0(v0: i32x4): @@ -26,3 +30,4 @@ block0(v0: i32x4): return v1 } ; run: %uwidenhigh_i32x4([1 2 3 4]) == [3 4] +; run: %uwidenhigh_i32x4([4 5 6 -1]) == [6 0xffffffff] diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif index c0a83ab80926..dda577046528 100644 --- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif +++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif @@ -2,9 +2,11 @@ test interpret test run target aarch64 target s390x +target x86_64 ssse3 has_sse41=false set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 -target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target x86_64 +target x86_64 sse41 +target x86_64 sse41 has_avx function %uwidenlow_i8x16(i8x16) -> i16x8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat index 5634b4ba217c..2ce1a3ed5c93 100644 --- a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat @@ -125,15 +125,15 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; vpmovsxbw %xmm0, %xmm10 -;; vpmovsxbw %xmm1, %xmm12 -;; vpmullw %xmm10, %xmm12, %xmm14 -;; vpalignr $8, %xmm0, %xmm0, %xmm8 -;; vpmovsxbw %xmm8, %xmm10 -;; vpalignr $8, %xmm1, %xmm1, %xmm12 -;; vpmovsxbw %xmm12, %xmm15 -;; vpmullw %xmm10, %xmm15, %xmm0 -;; vphaddw 
%xmm14, %xmm0, %xmm0 +;; vpmovsxbw %xmm0, %xmm12 +;; vpmovsxbw %xmm1, %xmm13 +;; vpmullw %xmm12, %xmm13, %xmm12 +;; vpalignr $8, %xmm0, %xmm0, %xmm11 +;; vpmovsxbw %xmm11, %xmm13 +;; vpalignr $8, %xmm1, %xmm1, %xmm11 +;; vpmovsxbw %xmm11, %xmm14 +;; vpmullw %xmm13, %xmm14, %xmm13 +;; vphaddw %xmm12, %xmm13, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp @@ -146,15 +146,15 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; vpmovsxbw %xmm0, %xmm13 -;; vpmovsxbw %xmm1, %xmm15 -;; vpmullw %xmm13, %xmm15, %xmm3 -;; vpalignr $8, %xmm0, %xmm0, %xmm11 -;; vpmovsxbw %xmm11, %xmm13 -;; vpalignr $8, %xmm1, %xmm1, %xmm15 -;; vpmovsxbw %xmm15, %xmm1 -;; vpmullw %xmm13, %xmm1, %xmm4 -;; vphaddw %xmm3, %xmm4, %xmm15 +;; vpmovsxbw %xmm0, %xmm15 +;; vpmovsxbw %xmm1, %xmm3 +;; vpmullw %xmm15, %xmm3, %xmm15 +;; vpalignr $8, %xmm0, %xmm0, %xmm14 +;; vpmovsxbw %xmm14, %xmm0 +;; vpalignr $8, %xmm1, %xmm1, %xmm14 +;; vpmovsxbw %xmm14, %xmm1 +;; vpmullw %xmm0, %xmm1, %xmm0 +;; vphaddw %xmm15, %xmm0, %xmm15 ;; vpmaddwd %xmm15, const(0), %xmm15 ;; vpaddd %xmm15, %xmm2, %xmm0 ;; jmp label1
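Much of the expected-output churn above comes from `uwiden_high` now interleaving with a zeroed register (`pxor` + `punpckhbw`) instead of rotating and widening (`palignr $8` + `pmovzxbw`). As a sanity check, the two sequences agree on every lane, including the new run-test inputs with the high bit set; this is a sketch with hypothetical helper names, not part of the test suite.

```rust
// Old lowering: rotate the high 8 bytes into the low half (palignr $8, v, v),
// then zero-extend the low 8 bytes (pmovzxbw).
fn old_uwiden_high_i8x16(v: [u8; 16]) -> [u16; 8] {
    let rotated: [u8; 16] = core::array::from_fn(|i| v[(i + 8) % 16]);
    core::array::from_fn(|i| u16::from(rotated[i]))
}

// New lowering: interleave the high 8 bytes with a zeroed register
// (pxor zero, zero; punpckhbw v, zero).
fn new_uwiden_high_i8x16(v: [u8; 16]) -> [u16; 8] {
    core::array::from_fn(|i| u16::from(v[8 + i]))
}

fn main() {
    // Mirrors the new run line for %uwidenhigh_i8x16: 0x80 and 0xff must be
    // zero-extended, not sign-extended.
    let v: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 0xff, 11, 0x92, 12, 0x70];
    let expected: [u16; 8] = [9, 0x80, 10, 0xff, 11, 0x92, 12, 0x70];
    assert_eq!(new_uwiden_high_i8x16(v), expected);
    assert_eq!(old_uwiden_high_i8x16(v), new_uwiden_high_i8x16(v));
}
```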