From 658da3eb4430ee4743b9a4711f15fa45207da8b6 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand
Date: Wed, 10 Aug 2022 21:12:13 +0200
Subject: [PATCH] s390x: Support both big- and little-endian vector lane order

This implements the s390x back-end portion of the solution for
https://github.com/bytecodealliance/wasmtime/issues/4566

We now support both big- and little-endian vector lane order in code
generation. The order used for a function is determined by the function's
ABI: if it uses a Wasmtime ABI, it will use little-endian lane order, and
big-endian lane order otherwise. (This ensures that all raw_bitcast
instructions generated by both wasmtime and other cranelift frontends can
always be implemented as a no-op.)

Lane order affects the implementation of a number of operations:
- Vector immediates
- Vector memory load / store (in big- and little-endian variants)
- Operations explicitly using lane numbers (insertlane, extractlane,
  shuffle, swizzle)
- Operations implicitly using lane numbers (iadd_pairwise, narrow/widen,
  promote/demote, fcvt_low, vhigh_bits)

In addition, when calling a function using a different lane order, we need
to lane-swap all vector values passed or returned in registers.

A small number of changes to common code were also needed:
- Ensure we always select a Wasmtime calling convention on s390x in
  crates/cranelift (func_signature).
- Fix vector immediates for filetests/runtests. In PR #4427, I attempted
  to fix this by byte-swapping the V128 value, but with the new scheme,
  we'd instead need to perform a per-lane byte swap. Since we do not know
  the actual type in write_to_slice and read_from_slice, this isn't easily
  possible. Revert this part of PR #4427 again, and instead just mark the
  memory buffer as little-endian when emitting the trampoline; the
  back-end will then emit correct code to load the constant.
- Change a runtest in simd-bitselect-to-vselect.clif to no longer make
  little-endian lane order assumptions.
- Remove runtests in simd-swizzle.clif that make little-endian lane order
  assumptions by relying on implicit type conversion when using a
  non-i16x8 swizzle result type (this feature should probably be removed
  anyway).

Tested with both wasmtime and cg_clif.
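For illustration, the lane-index translation that lane order boils down to
can be sketched as follows. This is a simplified, standalone sketch: the
actual be_lane_idx helper added below is an ISLE external constructor that
takes a CLIF type, but the arithmetic is the same idea.

    // Rough sketch of the lane-index translation implied by the two lane
    // orders. CLIF lane 0 is the least-significant lane under little-endian
    // lane order, while the ISA numbers lanes from the most-significant end,
    // so the index has to be mirrored; big-endian lane order matches the ISA
    // numbering directly.
    fn be_lane_idx(lane_count: u8, idx: u8, little_endian_lanes: bool) -> u8 {
        assert!(idx < lane_count);
        if little_endian_lanes {
            lane_count - 1 - idx
        } else {
            idx
        }
    }

    fn main() {
        // e.g. extractlane lane 0 of an i32x4 value refers to hardware
        // lane 3 under little-endian lane order, and lane 0 otherwise.
        assert_eq!(be_lane_idx(4, 0, true), 3);
        assert_eq!(be_lane_idx(4, 0, false), 0);
    }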
--- cranelift/codegen/src/data_value.rs | 10 +- cranelift/codegen/src/isa/s390x/inst.isle | 350 ++- cranelift/codegen/src/isa/s390x/inst/emit.rs | 38 +- .../codegen/src/isa/s390x/inst/emit_tests.rs | 468 ++++ cranelift/codegen/src/isa/s390x/inst/mod.rs | 95 +- cranelift/codegen/src/isa/s390x/lower.isle | 434 +++- cranelift/codegen/src/isa/s390x/lower/isle.rs | 69 +- .../filetests/isa/s390x/vec-abi.clif | 127 ++ .../filetests/isa/s390x/vec-arithmetic.clif | 90 +- .../isa/s390x/vec-constants-le-lane.clif | 213 ++ .../filetests/isa/s390x/vec-constants.clif | 10 +- .../isa/s390x/vec-conversions-le-lane.clif | 222 ++ .../filetests/isa/s390x/vec-conversions.clif | 48 +- .../filetests/filetests/isa/s390x/vec-fp.clif | 71 +- .../filetests/isa/s390x/vec-lane-arch13.clif | 120 +- .../isa/s390x/vec-lane-le-lane-arch13.clif | 807 +++++++ .../filetests/isa/s390x/vec-lane-le-lane.clif | 1964 +++++++++++++++++ .../filetests/isa/s390x/vec-lane.clif | 333 ++- .../filetests/isa/s390x/vec-logical.clif | 56 +- .../isa/s390x/vec-permute-le-lane.clif | 493 +++++ .../filetests/isa/s390x/vec-permute.clif | 103 +- .../filetests/isa/s390x/vecmem-arch13.clif | 44 +- .../isa/s390x/vecmem-le-lane-arch13.clif | 379 ++++ .../filetests/isa/s390x/vecmem-le-lane.clif | 494 +++++ .../filetests/filetests/isa/s390x/vecmem.clif | 113 +- .../runtests/simd-bitselect-to-vselect.clif | 3 +- .../filetests/runtests/simd-swizzle.clif | 20 - cranelift/filetests/src/function_runner.rs | 19 +- crates/cranelift/src/lib.rs | 6 +- 29 files changed, 6595 insertions(+), 604 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-abi.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-constants-le-lane.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-conversions-le-lane.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane-arch13.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-permute-le-lane.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vecmem-le-lane-arch13.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vecmem-le-lane.clif diff --git a/cranelift/codegen/src/data_value.rs b/cranelift/codegen/src/data_value.rs index 6abc29987b05..104012254f7e 100644 --- a/cranelift/codegen/src/data_value.rs +++ b/cranelift/codegen/src/data_value.rs @@ -91,8 +91,8 @@ impl DataValue { DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]), DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]), DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]), - DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()), - DataValue::V64(v) => dst[..8].copy_from_slice(&u64::from_le_bytes(*v).to_ne_bytes()), + DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]), + DataValue::V64(v) => dst[..8].copy_from_slice(&v[..]), _ => unimplemented!(), }; } @@ -124,11 +124,9 @@ impl DataValue { } _ if ty.is_vector() => { if ty.bytes() == 16 { - DataValue::V128( - u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes(), - ) + DataValue::V128(src[..16].try_into().unwrap()) } else if ty.bytes() == 8 { - DataValue::V64(u64::from_ne_bytes(src[..8].try_into().unwrap()).to_le_bytes()) + DataValue::V128(src[..8].try_into().unwrap()) } else { unimplemented!() } diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 
ede3619f849e..bec494e220bf 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -635,6 +635,36 @@ (rd WritableReg) (mem MemArg)) + ;; 8x16-bit byte-reversed vector load instruction. + (VecLoadByte16Rev + (rd WritableReg) + (mem MemArg)) + + ;; 4x32-bit byte-reversed vector load instruction. + (VecLoadByte32Rev + (rd WritableReg) + (mem MemArg)) + + ;; 2x64-bit byte-reversed vector load instruction. + (VecLoadByte64Rev + (rd WritableReg) + (mem MemArg)) + + ;; 8x16-bit element-reversed vector load instruction. + (VecLoadElt16Rev + (rd WritableReg) + (mem MemArg)) + + ;; 4x32-bit element-reversed vector load instruction. + (VecLoadElt32Rev + (rd WritableReg) + (mem MemArg)) + + ;; 2x64-bit element-reversed vector load instruction. + (VecLoadElt64Rev + (rd WritableReg) + (mem MemArg)) + ;; 128-bit vector store instruction. (VecStore (rd Reg) @@ -645,6 +675,36 @@ (rd Reg) (mem MemArg)) + ;; 8x16-bit byte-reversed vector store instruction. + (VecStoreByte16Rev + (rd Reg) + (mem MemArg)) + + ;; 4x32-bit byte-reversed vector store instruction. + (VecStoreByte32Rev + (rd Reg) + (mem MemArg)) + + ;; 2x64-bit byte-reversed vector store instruction. + (VecStoreByte64Rev + (rd Reg) + (mem MemArg)) + + ;; 8x16-bit element-reversed vector store instruction. + (VecStoreElt16Rev + (rd Reg) + (mem MemArg)) + + ;; 4x32-bit element-reversed vector store instruction. + (VecStoreElt32Rev + (rd Reg) + (mem MemArg)) + + ;; 2x64-bit element-reversed vector store instruction. + (VecStoreElt64Rev + (rd Reg) + (mem MemArg)) + ;; 128-bit vector load replicated element instruction. (VecLoadReplicate (size u32) @@ -1350,6 +1410,51 @@ (extern extractor allow_div_traps allow_div_traps) +;; Helpers for SIMD lane number operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; There are two ways to map vector types onto the SIMD vector registers +;; supported by the ISA, differing by the way lanes are numbered. In +;; little-endian lane order, lane 0 of a multi-lane vector value resides +;; in the least-significant parts of a vector register (when interpreted +;; as holding a single $I128 value); in big-endian lane order, lane 0 +;; instead resides in the most-significant parts of the register. +;; +;; As long as used consistently, output of cranelift may use either lane +;; order method to implement CLIF semantics. However, depending on the +;; particular use case, one or the other order will lead to more efficient +;; code. Therefore this back end supports both code generation options. +;; +;; Note that the ISA instructions use immediate lane number according +;; to big-endian lane order; so when using little-endian lane order, +;; immediate lane numbers have to be translated. +(type LaneOrder + (enum + (LittleEndian) + (BigEndian))) + +;; Return the lane order to be used when compiling the current function. +;; This will be a property of the function ABI. Functions using the +;; the Wasmtime ABI will use little-endian lane order, functions using +;; other ABIs will big-endian lane order. +(decl pure lane_order () LaneOrder) +(extern constructor lane_order lane_order) + +;; Check whether two lane order values are equal. 
+(decl pure lane_order_equal (LaneOrder LaneOrder) bool) +(rule (lane_order_equal (LaneOrder.LittleEndian) (LaneOrder.LittleEndian)) $true) +(rule (lane_order_equal (LaneOrder.LittleEndian) (LaneOrder.BigEndian)) $false) +(rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.LittleEndian)) $false) +(rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.BigEndian)) $true) + +;; Convert a CLIF immediate lane index value to big-endian lane order. +(decl be_lane_idx (Type u8) u8) +(extern constructor be_lane_idx be_lane_idx) + +;; Convert a CLIF immediate vector constant to big-endian lane order. +(decl be_vec_const (Type u128) u128) +(extern constructor be_vec_const be_vec_const) + + ;; Helpers for register numbers and types ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Hard-coded registers. @@ -1441,11 +1546,6 @@ (u32_pair (u16_pair (u8_pair i j) (u8_pair k l)) (u16_pair (u8_pair m n) (u8_pair o p))))) -;; Convert a little-endian lane index to a big-endian lane index. - -(decl be_lane_idx (Type u8) u8) -(extern constructor be_lane_idx be_lane_idx) - ;; Construct a VGBM mask to set all bits in one lane of a vector. (decl lane_byte_mask (Type u8) u16) @@ -2298,6 +2398,48 @@ (_ Unit (emit (MInst.VecLoadRev dst addr)))) dst)) +;; Helper for emitting `MInst.VecLoadByte16Rev` instructions. +(decl vec_load_byte16rev (Type MemArg) Reg) +(rule (vec_load_byte16rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadByte16Rev dst addr)))) + dst)) + +;; Helper for emitting `MInst.VecLoadByte32Rev` instructions. +(decl vec_load_byte32rev (Type MemArg) Reg) +(rule (vec_load_byte32rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadByte32Rev dst addr)))) + dst)) + +;; Helper for emitting `MInst.VecLoadByte64Rev` instructions. +(decl vec_load_byte64rev (Type MemArg) Reg) +(rule (vec_load_byte64rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadByte64Rev dst addr)))) + dst)) + +;; Helper for emitting `MInst.VecLoadElt16Rev` instructions. +(decl vec_load_elt16rev (Type MemArg) Reg) +(rule (vec_load_elt16rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadElt16Rev dst addr)))) + dst)) + +;; Helper for emitting `MInst.VecLoadElt32Rev` instructions. +(decl vec_load_elt32rev (Type MemArg) Reg) +(rule (vec_load_elt32rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadElt32Rev dst addr)))) + dst)) + +;; Helper for emitting `MInst.VecLoadElt64Rev` instructions. +(decl vec_load_elt64rev (Type MemArg) Reg) +(rule (vec_load_elt64rev ty addr) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecLoadElt64Rev dst addr)))) + dst)) + ;; Helper for emitting `MInst.VecStore` instructions. (decl vec_store (Reg MemArg) SideEffectNoResult) (rule (vec_store src addr) @@ -2308,6 +2450,36 @@ (rule (vec_storerev src addr) (SideEffectNoResult.Inst (MInst.VecStoreRev src addr))) +;; Helper for emitting `MInst.VecStoreByte16Rev` instructions. +(decl vec_store_byte16rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_byte16rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreByte16Rev src addr))) + +;; Helper for emitting `MInst.VecStoreByte32Rev` instructions. +(decl vec_store_byte32rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_byte32rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreByte32Rev src addr))) + +;; Helper for emitting `MInst.VecStoreByte64Rev` instructions. 
+(decl vec_store_byte64rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_byte64rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreByte64Rev src addr))) + +;; Helper for emitting `MInst.VecStoreElt16Rev` instructions. +(decl vec_store_elt16rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_elt16rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreElt16Rev src addr))) + +;; Helper for emitting `MInst.VecStoreElt32Rev` instructions. +(decl vec_store_elt32rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_elt32rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreElt32Rev src addr))) + +;; Helper for emitting `MInst.VecStoreElt64Rev` instructions. +(decl vec_store_elt64rev (Reg MemArg) SideEffectNoResult) +(rule (vec_store_elt64rev src addr) + (SideEffectNoResult.Inst (MInst.VecStoreElt64Rev src addr))) + ;; Helper for emitting `MInst.VecLoadReplicate` instructions. (decl vec_load_replicate (Type MemArg) Reg) (rule (vec_load_replicate (ty_vec128 ty @ (multi_lane size _)) addr) @@ -2660,6 +2832,34 @@ (rule (emit_arg_load $F64 mem) (vec_load_lane_undef $F64X2 mem 0)) (rule (emit_arg_load (vr128_ty ty) mem) (vec_load ty mem)) +;; Helper to perform a lane swap in register. +(decl vec_elt_rev (Type Reg) Reg) +(rule (vec_elt_rev (multi_lane 64 2) reg) + (vec_permute_dw_imm $I64X2 reg 1 reg 0)) +(rule (vec_elt_rev (multi_lane 32 4) reg) + (let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0))) + (vec_rot_imm $I64X2 rev 32))) +(rule (vec_elt_rev (multi_lane 16 8) reg) + (let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0))) + (vec_rot_imm $I32X4 (vec_rot_imm $I64X2 rev 32) 16))) +(rule (vec_elt_rev (multi_lane 8 16) reg) + (let ((rev Reg (vec_permute_dw_imm $I64X2 reg 1 reg 0))) + (vec_rot_imm $I16X8 (vec_rot_imm $I32X4 (vec_rot_imm $I64X2 rev 32) 16) 8))) + +;; When passing a vector value in register to a function whose ABI uses +;; a different lane order than the current function, we need to swap lanes. +;; The first operand is the lane order used by the callee. +(decl abi_vec_elt_rev (LaneOrder Type Reg) Reg) +(rule (abi_vec_elt_rev _ (gpr32_ty ty) reg) reg) +(rule (abi_vec_elt_rev _ (gpr64_ty ty) reg) reg) +(rule (abi_vec_elt_rev _ (ty_scalar_float ty) reg) reg) +(rule (abi_vec_elt_rev callee_lane_order _ reg) + (if-let $true (lane_order_equal callee_lane_order (lane_order))) + reg) +(rule (abi_vec_elt_rev callee_lane_order (vr128_ty ty) reg) + (if-let $false (lane_order_equal callee_lane_order (lane_order))) + (vec_elt_rev ty reg)) + ;; Helpers to emit a memory copy (MVC or memcpy libcall). (decl emit_memcpy (MemArg MemArg u64) Unit) (rule (emit_memcpy dst src (len_minus_one len)) @@ -2688,34 +2888,34 @@ ;; Copy a single argument/return value to its slots. ;; For oversized arguments, set the slot to the buffer address. 
-(decl copy_to_arg (i64 ABIArg Value) Unit) -(rule (copy_to_arg base (abi_arg_only_slot slot) val) - (copy_val_to_arg_slot base slot val)) -(rule (copy_to_arg base (abi_arg_struct_pointer slot offset _) _) +(decl copy_to_arg (LaneOrder i64 ABIArg Value) Unit) +(rule (copy_to_arg lo base (abi_arg_only_slot slot) val) + (copy_val_to_arg_slot lo base slot val)) +(rule (copy_to_arg _ base (abi_arg_struct_pointer slot offset _) _) (let ((ptr Reg (load_addr (memarg_stack_off base offset)))) (copy_reg_to_arg_slot base slot ptr))) -(rule (copy_to_arg base (abi_arg_implicit_pointer slot offset _) _) +(rule (copy_to_arg _ base (abi_arg_implicit_pointer slot offset _) _) (let ((ptr Reg (load_addr (memarg_stack_off base offset)))) (copy_reg_to_arg_slot base slot ptr))) ;; Copy a single argument/return value from its slots. -(decl copy_from_arg (i64 ABIArg) ValueRegs) -(rule (copy_from_arg base (abi_arg_only_slot slot)) - (value_reg (copy_reg_from_arg_slot base slot))) +(decl copy_from_arg (LaneOrder i64 ABIArg) ValueRegs) +(rule (copy_from_arg lo base (abi_arg_only_slot slot)) + (value_reg (copy_reg_from_arg_slot lo base slot))) ;; Copy one component of an argument/return value to its slot. -(decl copy_val_to_arg_slot (i64 ABIArgSlot Value) Unit) -(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg ty (ArgumentExtension.None)) val) - (emit_mov ty (real_reg_to_writable_reg reg) val)) -(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Uext)) val) +(decl copy_val_to_arg_slot (LaneOrder i64 ABIArgSlot Value) Unit) +(rule (copy_val_to_arg_slot lo _ (ABIArgSlot.Reg reg ty (ArgumentExtension.None)) val) + (emit_mov ty (real_reg_to_writable_reg reg) (abi_vec_elt_rev lo ty val))) +(rule (copy_val_to_arg_slot _ _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Uext)) val) (emit_put_in_reg_zext64 (real_reg_to_writable_reg reg) val)) -(rule (copy_val_to_arg_slot _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Sext)) val) +(rule (copy_val_to_arg_slot _ _ (ABIArgSlot.Reg reg _ (ArgumentExtension.Sext)) val) (emit_put_in_reg_sext64 (real_reg_to_writable_reg reg) val)) -(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset ty (ArgumentExtension.None)) val) +(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset ty (ArgumentExtension.None)) val) (emit_arg_store ty val (memarg_stack_off base offset))) -(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset _ (ArgumentExtension.Uext)) val) +(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset _ (ArgumentExtension.Uext)) val) (emit_arg_store $I64 (put_in_reg_zext64 val) (memarg_stack_off base offset))) -(rule (copy_val_to_arg_slot base (ABIArgSlot.Stack offset _ (ArgumentExtension.Sext)) val) +(rule (copy_val_to_arg_slot _ base (ABIArgSlot.Stack offset _ (ArgumentExtension.Sext)) val) (emit_arg_store $I64 (put_in_reg_sext64 val) (memarg_stack_off base offset))) ;; Copy one component of an argument/return value to its slot, where the @@ -2727,10 +2927,10 @@ (emit_arg_store (abi_ext_ty ext ty) src (memarg_stack_off base offset))) ;; Copy one component of an argument/return value from its slot. 
-(decl copy_reg_from_arg_slot (i64 ABIArgSlot) Reg) -(rule (copy_reg_from_arg_slot _ (ABIArgSlot.Reg reg ty ext)) - (copy_reg (abi_ext_ty ext ty) (real_reg_to_reg reg))) -(rule (copy_reg_from_arg_slot base (ABIArgSlot.Stack offset ty ext)) +(decl copy_reg_from_arg_slot (LaneOrder i64 ABIArgSlot) Reg) +(rule (copy_reg_from_arg_slot lo _ (ABIArgSlot.Reg reg ty ext)) + (abi_vec_elt_rev lo ty (copy_reg (abi_ext_ty ext ty) (real_reg_to_reg reg)))) +(rule (copy_reg_from_arg_slot _ base (ABIArgSlot.Stack offset ty ext)) (emit_arg_load (abi_ext_ty ext ty) (memarg_stack_off base offset))) ;; Helper to compute the type of an implicitly extended argument/return value. @@ -2882,13 +3082,6 @@ (rule (ty_ext64 $I32) $I64) (rule (ty_ext64 $I64) $I64) -;; 128-bit vector type with lane type `Type`. -(decl ty_vec128_from_lane_ty (Type) Type) -(rule (ty_vec128_from_lane_ty $I8) $I8X16) -(rule (ty_vec128_from_lane_ty $I16) $I16X8) -(rule (ty_vec128_from_lane_ty $I32) $I32X4) -(rule (ty_vec128_from_lane_ty $I64) $I64X2) - ;; Zero-extend a register from a smaller `Type` into a 32-bit destination. (Non-SSA form.) ;; This handles both integer and boolean input types. (decl emit_zext32_reg (WritableReg Type Reg) Unit) @@ -3440,6 +3633,9 @@ (decl abi_accumulate_outgoing_args_size (ABISig) Unit) (extern constructor abi_accumulate_outgoing_args_size abi_accumulate_outgoing_args_size) +(decl abi_lane_order (ABISig) LaneOrder) +(extern constructor abi_lane_order abi_lane_order) + ;; Helpers for generating calls to library routines ;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3525,6 +3721,72 @@ (decl vec_unpacku_high (Type Reg) Reg) (rule (vec_unpacku_high ty x) (vec_rr ty (vecop_unpacku_high ty) x)) +;; Versions of pack using current lane order semantics. +;; First source operand contains values that will end up in the +;; lower-numbered lanes of the result, second operand contains +;; values that will end up in the higher-numbered lanes. + +(decl vec_pack_lane_order (Type Reg Reg) Reg) +(rule (vec_pack_lane_order ty x y) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_pack ty x y)) +(rule (vec_pack_lane_order ty x y) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_pack ty y x)) + +(decl vec_pack_ssat_lane_order (Type Reg Reg) Reg) +(rule (vec_pack_ssat_lane_order ty x y) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_pack_ssat ty x y)) +(rule (vec_pack_ssat_lane_order ty x y) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_pack_ssat ty y x)) + +(decl vec_pack_usat_lane_order (Type Reg Reg) Reg) +(rule (vec_pack_usat_lane_order ty x y) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_pack_usat ty x y)) +(rule (vec_pack_usat_lane_order ty x y) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_pack_usat ty y x)) + +;; Versions of unpack using current lane order semantics. +;; unpack_low will consume values from the lower-numbered +;; lanes of the input, and unpack_high will consume values +;; from higher-numbered lanes. 
+ +(decl vec_unpacks_low_lane_order (Type Reg) Reg) +(rule (vec_unpacks_low_lane_order ty x) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_unpacks_high ty x)) +(rule (vec_unpacks_low_lane_order ty x) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_unpacks_low ty x)) + +(decl vec_unpacks_high_lane_order (Type Reg) Reg) +(rule (vec_unpacks_high_lane_order ty x) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_unpacks_low ty x)) +(rule (vec_unpacks_high_lane_order ty x) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_unpacks_high ty x)) + +(decl vec_unpacku_low_lane_order (Type Reg) Reg) +(rule (vec_unpacku_low_lane_order ty x) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_unpacku_high ty x)) +(rule (vec_unpacku_low_lane_order ty x) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_unpacku_low ty x)) + +(decl vec_unpacku_high_lane_order (Type Reg) Reg) +(rule (vec_unpacku_high_lane_order ty x) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_unpacku_low ty x)) +(rule (vec_unpacku_high_lane_order ty x) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_unpacku_high ty x)) + ;; Helpers for generating vector merge instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3546,6 +3808,30 @@ (decl vec_merge_high (Type Reg Reg) Reg) (rule (vec_merge_high ty x y) (vec_rrr ty (vecop_merge_high ty) x y)) +;; Versions of merge using current lane order semantics. +;; merge_low will consume values from the lower-numbered +;; lanes of the inputs, and merge_high will consume values +;; from higher-numbered lanes. In both cases, values from +;; the first input will end up in even-numbered lanes, and +;; values from the second input will end up in odd-numbered +;; lanes of the output. + +(decl vec_merge_low_lane_order (Type Reg Reg) Reg) +(rule (vec_merge_low_lane_order ty x y) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_merge_high ty x y)) +(rule (vec_merge_low_lane_order ty x y) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_merge_low ty y x)) + +(decl vec_merge_high_lane_order (Type Reg Reg) Reg) +(rule (vec_merge_high_lane_order ty x y) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_merge_low ty x y)) +(rule (vec_merge_high_lane_order ty x y) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_merge_high ty y x)) + ;; Helpers for generating `clz` and `ctz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index 4deb69aae19d..e18430a7a072 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -2839,24 +2839,50 @@ impl MachInstEmit for Inst { inst.emit(&[], sink, emit_info, state); } - &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + &Inst::VecLoad { rd, ref mem } + | &Inst::VecLoadRev { rd, ref mem } + | &Inst::VecLoadByte16Rev { rd, ref mem } + | &Inst::VecLoadByte32Rev { rd, ref mem } + | &Inst::VecLoadByte64Rev { rd, ref mem } + | &Inst::VecLoadElt16Rev { rd, ref mem } + | &Inst::VecLoadElt32Rev { rd, ref mem } + | &Inst::VecLoadElt64Rev { rd, ref mem } => { let rd = allocs.next_writable(rd); let mem = mem.with_allocs(&mut allocs); let (opcode, m3) = match self { - &Inst::VecLoad { .. } => (0xe706, 0), // VL - &Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ + &Inst::VecLoad { .. } => (0xe706, 0), // VL + &Inst::VecLoadRev { .. } => (0xe606, 4), // VLBRQ + &Inst::VecLoadByte16Rev { .. } => (0xe606, 1), // VLBRH + &Inst::VecLoadByte32Rev { .. 
} => (0xe606, 2), // VLBRF + &Inst::VecLoadByte64Rev { .. } => (0xe606, 3), // VLBRG + &Inst::VecLoadElt16Rev { .. } => (0xe607, 1), // VLERH + &Inst::VecLoadElt32Rev { .. } => (0xe607, 2), // VLERF + &Inst::VecLoadElt64Rev { .. } => (0xe607, 3), // VLERG _ => unreachable!(), }; mem_vrx_emit(rd.to_reg(), &mem, opcode, m3, true, sink, emit_info, state); } - &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + &Inst::VecStore { rd, ref mem } + | &Inst::VecStoreRev { rd, ref mem } + | &Inst::VecStoreByte16Rev { rd, ref mem } + | &Inst::VecStoreByte32Rev { rd, ref mem } + | &Inst::VecStoreByte64Rev { rd, ref mem } + | &Inst::VecStoreElt16Rev { rd, ref mem } + | &Inst::VecStoreElt32Rev { rd, ref mem } + | &Inst::VecStoreElt64Rev { rd, ref mem } => { let rd = allocs.next(rd); let mem = mem.with_allocs(&mut allocs); let (opcode, m3) = match self { - &Inst::VecStore { .. } => (0xe70e, 0), // VST - &Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ + &Inst::VecStore { .. } => (0xe70e, 0), // VST + &Inst::VecStoreRev { .. } => (0xe60e, 4), // VSTBRQ + &Inst::VecStoreByte16Rev { .. } => (0xe60e, 1), // VSTBRH + &Inst::VecStoreByte32Rev { .. } => (0xe60e, 2), // VSTBRF + &Inst::VecStoreByte64Rev { .. } => (0xe60e, 3), // VSTBRG + &Inst::VecStoreElt16Rev { .. } => (0xe60f, 1), // VSTERH + &Inst::VecStoreElt32Rev { .. } => (0xe60f, 2), // VSTERF + &Inst::VecStoreElt64Rev { .. } => (0xe60f, 3), // VSTERG _ => unreachable!(), }; mem_vrx_emit(rd, &mem, opcode, m3, true, sink, emit_info, state); diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index 9f593d851024..c1cbe87f8849 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -10091,6 +10091,240 @@ fn test_s390x_binemit() { "E61230004806", "vlbrq %v17, 0(%r2,%r3)", )); + insns.push(( + Inst::VecLoadByte16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020001806", + "vlbrh %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadByte16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF1806", + "vlbrh %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadByte16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230001806", + "vlbrh %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadByte32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020002806", + "vlbrf %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadByte32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF2806", + "vlbrf %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadByte32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230002806", + "vlbrf %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadByte64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, 
+ }, + "E61020003806", + "vlbrg %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadByte64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF3806", + "vlbrg %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadByte64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230003806", + "vlbrg %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadElt16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020001807", + "vlerh %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadElt16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF1807", + "vlerh %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadElt16Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230001807", + "vlerh %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadElt32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020002807", + "vlerf %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadElt32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF2807", + "vlerf %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadElt32Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230002807", + "vlerf %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecLoadElt64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61020003807", + "vlerg %v17, 0(%r2)", + )); + insns.push(( + Inst::VecLoadElt64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF3807", + "vlerg %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecLoadElt64Rev { + rd: writable_vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E61230003807", + "vlerg %v17, 0(%r2,%r3)", + )); insns.push(( Inst::VecStore { rd: vr(17), @@ -10169,6 +10403,240 @@ fn test_s390x_binemit() { "E6123000480E", "vstbrq %v17, 0(%r2,%r3)", )); + insns.push(( + Inst::VecStoreByte16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000180E", + "vstbrh %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreByte16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF180E", + "vstbrh %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreByte16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, 
+ }, + "E6123000180E", + "vstbrh %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreByte32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000280E", + "vstbrf %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreByte32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF280E", + "vstbrf %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreByte32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000280E", + "vstbrf %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreByte64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000380E", + "vstbrg %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreByte64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF380E", + "vstbrg %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreByte64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000380E", + "vstbrg %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreElt16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000180F", + "vsterh %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreElt16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF180F", + "vsterh %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreElt16Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000180F", + "vsterh %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreElt32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000280F", + "vsterf %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreElt32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF280F", + "vsterf %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreElt32Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6123000280F", + "vsterf %v17, 0(%r2,%r3)", + )); + insns.push(( + Inst::VecStoreElt64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::zero(), + flags: MemFlags::trusted(), + }, + }, + "E6102000380F", + "vsterg %v17, 0(%r2)", + )); + insns.push(( + Inst::VecStoreElt64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(2), + index: zero_reg(), + disp: UImm12::maybe_from_u64(4095).unwrap(), + flags: MemFlags::trusted(), + }, + }, + "E6102FFF380F", + "vsterg %v17, 4095(%r2)", + )); + insns.push(( + Inst::VecStoreElt64Rev { + rd: vr(17), + mem: MemArg::BXD12 { + base: gpr(3), + index: gpr(2), + disp: UImm12::zero(), + flags: 
MemFlags::trusted(), + }, + }, + "E6123000380F", + "vsterg %v17, 0(%r2,%r3)", + )); insns.push(( Inst::VecLoadReplicate { size: 8, diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index e561a688b4d7..9f4ba781d7a9 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -28,8 +28,9 @@ mod emit_tests; // Instructions (top level): definition pub use crate::isa::s390x::lower::isle::generated_code::{ - ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, MInst as Inst, RxSBGOp, - ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, VecUnaryOp, + ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst, + RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, VecShiftOp, + VecUnaryOp, }; /// Additional information for (direct) Call instructions, left out of line to lower the size of @@ -245,7 +246,19 @@ impl Inst { // These are all part of VXRS_EXT2 Inst::VecLoadRev { .. } + | Inst::VecLoadByte16Rev { .. } + | Inst::VecLoadByte32Rev { .. } + | Inst::VecLoadByte64Rev { .. } + | Inst::VecLoadElt16Rev { .. } + | Inst::VecLoadElt32Rev { .. } + | Inst::VecLoadElt64Rev { .. } | Inst::VecStoreRev { .. } + | Inst::VecStoreByte16Rev { .. } + | Inst::VecStoreByte32Rev { .. } + | Inst::VecStoreByte64Rev { .. } + | Inst::VecStoreElt16Rev { .. } + | Inst::VecStoreElt32Rev { .. } + | Inst::VecStoreElt64Rev { .. } | Inst::VecLoadReplicateRev { .. } | Inst::VecLoadLaneRev { .. } | Inst::VecLoadLaneRevUndef { .. } @@ -762,6 +775,30 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_def(rd); memarg_operands(mem, collector); } + &Inst::VecLoadByte16Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadByte32Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadByte64Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadElt16Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadElt32Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } + &Inst::VecLoadElt64Rev { rd, ref mem, .. } => { + collector.reg_def(rd); + memarg_operands(mem, collector); + } &Inst::VecStore { rd, ref mem, .. } => { collector.reg_use(rd); memarg_operands(mem, collector); @@ -770,6 +807,30 @@ fn s390x_get_operands VReg>(inst: &Inst, collector: &mut OperandC collector.reg_use(rd); memarg_operands(mem, collector); } + &Inst::VecStoreByte16Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreByte32Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreByte64Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreElt16Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreElt32Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } + &Inst::VecStoreElt64Rev { rd, ref mem, .. } => { + collector.reg_use(rd); + memarg_operands(mem, collector); + } &Inst::VecLoadReplicate { rd, ref mem, .. 
} => { collector.reg_def(rd); memarg_operands(mem, collector); @@ -2476,10 +2537,23 @@ impl Inst { op, rm, rn, tmp, rn, rm ) } - &Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => { + &Inst::VecLoad { rd, ref mem } + | &Inst::VecLoadRev { rd, ref mem } + | &Inst::VecLoadByte16Rev { rd, ref mem } + | &Inst::VecLoadByte32Rev { rd, ref mem } + | &Inst::VecLoadByte64Rev { rd, ref mem } + | &Inst::VecLoadElt16Rev { rd, ref mem } + | &Inst::VecLoadElt32Rev { rd, ref mem } + | &Inst::VecLoadElt64Rev { rd, ref mem } => { let opcode = match self { &Inst::VecLoad { .. } => "vl", &Inst::VecLoadRev { .. } => "vlbrq", + &Inst::VecLoadByte16Rev { .. } => "vlbrh", + &Inst::VecLoadByte32Rev { .. } => "vlbrf", + &Inst::VecLoadByte64Rev { .. } => "vlbrg", + &Inst::VecLoadElt16Rev { .. } => "vlerh", + &Inst::VecLoadElt32Rev { .. } => "vlerf", + &Inst::VecLoadElt64Rev { .. } => "vlerg", _ => unreachable!(), }; @@ -2489,10 +2563,23 @@ impl Inst { let mem = mem.pretty_print_default(); format!("{}{} {}, {}", mem_str, opcode, rd, mem) } - &Inst::VecStore { rd, ref mem } | &Inst::VecStoreRev { rd, ref mem } => { + &Inst::VecStore { rd, ref mem } + | &Inst::VecStoreRev { rd, ref mem } + | &Inst::VecStoreByte16Rev { rd, ref mem } + | &Inst::VecStoreByte32Rev { rd, ref mem } + | &Inst::VecStoreByte64Rev { rd, ref mem } + | &Inst::VecStoreElt16Rev { rd, ref mem } + | &Inst::VecStoreElt32Rev { rd, ref mem } + | &Inst::VecStoreElt64Rev { rd, ref mem } => { let opcode = match self { &Inst::VecStore { .. } => "vst", &Inst::VecStoreRev { .. } => "vstbrq", + &Inst::VecStoreByte16Rev { .. } => "vstbrh", + &Inst::VecStoreByte32Rev { .. } => "vstbrf", + &Inst::VecStoreByte64Rev { .. } => "vstbrg", + &Inst::VecStoreElt16Rev { .. } => "vsterh", + &Inst::VecStoreElt32Rev { .. } => "vsterf", + &Inst::VecStoreElt64Rev { .. } => "vsterg", _ => unreachable!(), }; diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 495ea33a24c8..3848ce375eeb 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -39,7 +39,7 @@ ;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (vconst (u128_from_constant x)))) - (vec_imm ty x)) + (vec_imm ty (be_vec_const ty x))) ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -148,9 +148,9 @@ ;; Lane-wise integer pairwise addition for 8-/16/32-bit vector registers. (rule (lower (has_type ty @ (multi_lane bits _) (iadd_pairwise x y))) (let ((size Reg (vec_imm_splat $I8X16 (u32_as_u64 bits)))) - (vec_pack (vec_widen_type ty) - (vec_add ty y (vec_lshr_by_byte y size)) - (vec_add ty x (vec_lshr_by_byte x size))))) + (vec_pack_lane_order (vec_widen_type ty) + (vec_add ty x (vec_lshr_by_byte x size)) + (vec_add ty y (vec_lshr_by_byte y size))))) ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -929,9 +929,14 @@ (put_in_reg_zext64 x)) ;; 128-bit target types. 
-(rule (lower (has_type (vr128_ty _ty) (uextend x @ (value_type src_ty)))) - (let ((ty Type (ty_vec128_from_lane_ty src_ty))) - (vec_insert_lane ty (vec_imm ty 0) x (be_lane_idx ty 0) (zero_reg)))) +(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I8)))) + (vec_insert_lane $I8X16 (vec_imm ty 0) x 15 (zero_reg))) +(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I16)))) + (vec_insert_lane $I16X8 (vec_imm ty 0) x 7 (zero_reg))) +(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I32)))) + (vec_insert_lane $I32X4 (vec_imm ty 0) x 3 (zero_reg))) +(rule (lower (has_type (vr128_ty ty) (uextend x @ (value_type $I64)))) + (vec_insert_lane $I64X2 (vec_imm ty 0) x 1 (zero_reg))) ;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -953,44 +958,44 @@ ;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (snarrow x @ (value_type (ty_vec128 ty)) y)) - (vec_pack_ssat ty y x)) + (vec_pack_ssat_lane_order ty x y)) ;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (uunarrow x @ (value_type (ty_vec128 ty)) y)) - (vec_pack_usat ty y x)) + (vec_pack_usat_lane_order ty x y)) ;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (unarrow x @ (value_type (ty_vec128 ty)) y)) (let ((zero Reg (vec_imm ty 0))) - (vec_pack_usat ty (vec_smax ty y zero) (vec_smax ty x zero)))) + (vec_pack_usat_lane_order ty (vec_smax ty x zero) (vec_smax ty y zero)))) ;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (swiden_low x @ (value_type (ty_vec128 ty)))) - (vec_unpacks_low ty x)) + (vec_unpacks_low_lane_order ty x)) ;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (swiden_high x @ (value_type (ty_vec128 ty)))) - (vec_unpacks_high ty x)) + (vec_unpacks_high_lane_order ty x)) ;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (uwiden_low x @ (value_type (ty_vec128 ty)))) - (vec_unpacku_low ty x)) + (vec_unpacku_low_lane_order ty x)) ;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (uwiden_high x @ (value_type (ty_vec128 ty)))) - (vec_unpacku_high ty x)) + (vec_unpacku_high_lane_order ty x)) ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1583,7 +1588,7 @@ ;; Promote a register. (rule (lower (has_type $F64X2 (fvpromote_low x @ (value_type $F32X4)))) - (fpromote_reg $F64X2 $F32X4 (vec_merge_low $I32X4 x x))) + (fpromote_reg $F64X2 $F32X4 (vec_merge_low_lane_order $I32X4 x x))) ;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1598,9 +1603,8 @@ ;; Demote a register. (rule (lower (has_type $F32X4 (fvdemote x @ (value_type $F64X2)))) (let ((dst Reg (fdemote_reg $F32X4 $F64X2 (FpuRoundMode.Current) x))) - (vec_permute $F32X4 dst (vec_imm $F32X4 0) - (vec_imm $I8X16 (imm8x16 16 16 16 16 16 16 16 16 - 0 1 2 3 8 9 10 11))))) + (vec_pack_lane_order $I64X2 (vec_lshr_imm $I64X2 dst 32) + (vec_imm $I64X2 0)))) ;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1690,7 +1694,7 @@ ;; Convert the low half of a $I32X4 to a $F64X2. 
(rule (lower (has_type $F64X2 (fcvt_low_from_sint x @ (value_type $I32X4)))) (fcvt_from_sint_reg $F64X2 (FpuRoundMode.ToNearestTiesToEven) - (vec_unpacks_low $I32X4 x))) + (vec_unpacks_low_lane_order $I32X4 x))) ;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1845,7 +1849,12 @@ ;;;; Rules for `raw_bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Raw bitcast is always a no-op. +;; FIXME: There are two flavors of raw_bitcast, which are currently not +;; distinguished in CLIF IR. Those generated by Wasmtime assume little-endian +;; lane order, and those generated elsewhere assume big-endian lane order. +;; Raw bitcast is a no-op if current lane order matches that assumed lane order. +;; However, due to our choice of lane order depending on the current function +;; ABI, every bitcast we currently see here is indeed a no-op. (rule (lower (raw_bitcast x)) x) @@ -2352,9 +2361,20 @@ ;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; We need to modify the lane mask at runtime in two ways: -;; - convert from little-endian to big-endian lane numbering -;; - handle mask elements outside the range 0..15 by zeroing the lane +;; When using big-endian lane order, the lane mask is mostly correct, but we +;; need to handle mask elements outside the range 0..15 by zeroing the lane. +;; +;; To do so efficiently, we compute: +;; permute-lane-element := umin (16, swizzle-lane-element) +;; and pass a zero vector as second operand to the permute instruction. + +(rule (lower (has_type (ty_vec128 ty) (swizzle x y))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_permute ty x (vec_imm ty 0) + (vec_umin $I8X16 (vec_imm_splat $I8X16 16) y))) + +;; When using little-endian lane order, in addition to zeroing (as above), +;; we need to convert from little-endian to big-endian lane numbering. ;; ;; To do so efficiently, we compute: ;; permute-lane-element := umax (239, ~ swizzle-lane-element) @@ -2368,6 +2388,7 @@ ;; to implement the required swizzle semantics. (rule (lower (has_type (ty_vec128 ty) (swizzle x y))) + (if-let (LaneOrder.LittleEndian) (lane_order)) (vec_permute ty (vec_imm ty 0) x (vec_umax $I8X16 (vec_imm_splat $I8X16 239) (vec_not $I8X16 y)))) @@ -2485,18 +2506,36 @@ (rule (lower (has_type $F64 (load flags @ (littleendian) addr offset))) (vec_load_lane_little_undef $F64X2 (lower_address flags addr offset) 0)) -;; Load 128-bit big-endian vector values. +;; Load 128-bit big-endian vector values, BE lane order - direct load. (rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset))) + (if-let (LaneOrder.BigEndian) (lane_order)) (vec_load ty (lower_address flags addr offset))) -;; Load 128-bit little-endian vector values (z15 instruction). -(rule (lower (has_type (and (vxrs_ext2_enabled) (vr128_ty ty)) - (load flags @ (littleendian) addr offset))) +;; Load 128-bit little-endian vector values, BE lane order - byte-reversed load. +(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_load_byte_rev ty flags addr offset)) + +;; Load 128-bit big-endian vector values, LE lane order - element-reversed load. +(rule (lower (has_type (vr128_ty ty) (load flags @ (bigendian) addr offset))) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_load_elt_rev ty flags addr offset)) + +;; Load 128-bit little-endian vector values, LE lane order - fully-reversed load. 
+(rule (lower (has_type (vr128_ty ty) (load flags @ (littleendian) addr offset))) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_load_full_rev ty flags addr offset)) + + +;; Helper to perform a 128-bit full-vector byte-reversed load. +(decl vec_load_full_rev (Type MemFlags Value Offset32) Reg) + +;; Full-vector byte-reversed load via single instruction on z15. +(rule (vec_load_full_rev (and (vxrs_ext2_enabled) (vr128_ty ty)) flags addr offset) (vec_loadrev ty (lower_address flags addr offset))) -;; Load 128-bit little-endian vector values (via GPRs on z14). -(rule (lower (has_type (and (vxrs_ext2_disabled) (vr128_ty ty)) - (load flags @ (littleendian) addr offset))) +;; Full-vector byte-reversed load via GPRs on z14. +(rule (vec_load_full_rev (and (vxrs_ext2_disabled) (vr128_ty ty)) flags addr offset) (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) (hi_addr MemArg (lower_address_bias flags addr offset 8)) (lo_val Reg (loadrev64 lo_addr)) @@ -2504,6 +2543,75 @@ (mov_to_vec128 ty hi_val lo_val))) +;; Helper to perform an element-wise byte-reversed load. +(decl vec_load_byte_rev (Type MemFlags Value Offset32) Reg) + +;; Element-wise byte-reversed 1x128-bit load is a full byte-reversed load. +(rule (vec_load_byte_rev $I128 flags addr offset) + (vec_load_full_rev $I128 flags addr offset)) + +;; Element-wise byte-reversed 16x8-bit load is a direct load. +(rule (vec_load_byte_rev ty @ (multi_lane 8 16) flags addr offset) + (vec_load ty (lower_address flags addr offset))) + +;; Element-wise byte-reversed load via single instruction on z15. +(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2)) + flags addr offset) + (vec_load_byte64rev ty (lower_address flags addr offset))) +(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4)) + flags addr offset) + (vec_load_byte32rev ty (lower_address flags addr offset))) +(rule (vec_load_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8)) + flags addr offset) + (vec_load_byte16rev ty (lower_address flags addr offset))) + +;; Element-wise byte-reversed load as element-swapped byte-reversed load on z14. +(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2)) + flags addr offset) + (vec_elt_rev ty (vec_load_full_rev ty flags addr offset))) +(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4)) + flags addr offset) + (vec_elt_rev ty (vec_load_full_rev ty flags addr offset))) +(rule (vec_load_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8)) + flags addr offset) + (vec_elt_rev ty (vec_load_full_rev ty flags addr offset))) + + +;; Helper to perform an element-reversed load. +(decl vec_load_elt_rev (Type MemFlags Value Offset32) Reg) + +;; Element-reversed 1x128-bit load is a direct load. +;; For 1x128-bit types, this is a direct load. +(rule (vec_load_elt_rev $I128 flags addr offset) + (vec_load $I128 (lower_address flags addr offset))) + +;; Element-reversed 16x8-bit load is a full byte-reversed load. +(rule (vec_load_elt_rev ty @ (multi_lane 8 16) flags addr offset) + (vec_load_full_rev ty flags addr offset)) + +;; Element-reversed load via single instruction on z15. 
+(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2)) + flags addr offset) + (vec_load_elt64rev ty (lower_address flags addr offset))) +(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4)) + flags addr offset) + (vec_load_elt32rev ty (lower_address flags addr offset))) +(rule (vec_load_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8)) + flags addr offset) + (vec_load_elt16rev ty (lower_address flags addr offset))) + +;; Element-reversed load as element-swapped direct load on z14. +(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2)) + flags addr offset) + (vec_elt_rev ty (vec_load ty (lower_address flags addr offset)))) +(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4)) + flags addr offset) + (vec_elt_rev ty (vec_load ty (lower_address flags addr offset)))) +(rule (vec_load_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8)) + flags addr offset) + (vec_elt_rev ty (vec_load ty (lower_address flags addr offset)))) + + ;;;; Rules for `uload8` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 16- or 32-bit target types. @@ -2606,65 +2714,77 @@ ;;;; Rules for `uloadNxM` and `sloadNxM` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Unsigned 8->16 bit extension, big-endian source value. -(rule (lower (has_type $I16X8 (uload8x8 flags @ (bigendian) addr offset))) - (vec_unpacku_high $I8X16 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Unsigned 8->16 bit extension, little-endian source value. -(rule (lower (has_type $I16X8 (uload8x8 flags @ (littleendian) addr offset))) - (vec_unpacku_high $I8X16 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 8->16 bit extension, big-endian source value. -(rule (lower (has_type $I16X8 (sload8x8 flags @ (bigendian) addr offset))) - (vec_unpacks_high $I8X16 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 8->16 bit extension, little-endian source value. -(rule (lower (has_type $I16X8 (sload8x8 flags @ (littleendian) addr offset))) - (vec_unpacks_high $I8X16 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Unsigned 16->32 bit extension, big-endian source value. -(rule (lower (has_type $I32X4 (uload16x4 flags @ (bigendian) addr offset))) - (vec_unpacku_high $I16X8 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Unsigned 16->32 bit extension, little-endian source value. -(rule (lower (has_type $I32X4 (uload16x4 flags @ (littleendian) addr offset))) - (vec_unpacku_high $I16X8 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 16->32 bit extension, big-endian source value. -(rule (lower (has_type $I32X4 (sload16x4 flags @ (bigendian) addr offset))) - (vec_unpacks_high $I16X8 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 16->32 bit extension, little-endian source value. -(rule (lower (has_type $I32X4 (sload16x4 flags @ (littleendian) addr offset))) - (vec_unpacks_high $I16X8 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Unsigned 32->64 bit extension, big-endian source value. -(rule (lower (has_type $I64X2 (uload32x2 flags @ (bigendian) addr offset))) - (vec_unpacku_high $I32X4 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Unsigned 32->64 bit extension, little-endian source value. 
-(rule (lower (has_type $I64X2 (uload32x2 flags @ (littleendian) addr offset))) - (vec_unpacku_high $I32X4 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 32->64 bit extension, big-endian source value. -(rule (lower (has_type $I64X2 (sload32x2 flags @ (bigendian) addr offset))) - (vec_unpacks_high $I32X4 - (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0))) - -;; Signed 32->64 bit extension, little-endian source value. -(rule (lower (has_type $I64X2 (sload32x2 flags @ (littleendian) addr offset))) - (vec_unpacks_high $I32X4 - (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0))) +;; Unsigned 8->16 bit extension. +(rule (lower (has_type $I16X8 (uload8x8 flags addr offset))) + (vec_unpacku_high $I8X16 (load_v64 $I8X16 flags addr offset))) + +;; Signed 8->16 bit extension. +(rule (lower (has_type $I16X8 (sload8x8 flags addr offset))) + (vec_unpacks_high $I8X16 (load_v64 $I8X16 flags addr offset))) + +;; Unsigned 16->32 bit extension. +(rule (lower (has_type $I32X4 (uload16x4 flags addr offset))) + (vec_unpacku_high $I16X8 (load_v64 $I16X8 flags addr offset))) + +;; Signed 16->32 bit extension. +(rule (lower (has_type $I32X4 (sload16x4 flags addr offset))) + (vec_unpacks_high $I16X8 (load_v64 $I16X8 flags addr offset))) + +;; Unsigned 32->64 bit extension. +(rule (lower (has_type $I64X2 (uload32x2 flags addr offset))) + (vec_unpacku_high $I32X4 (load_v64 $I32X4 flags addr offset))) + +;; Signed 32->64 bit extension. +(rule (lower (has_type $I64X2 (sload32x2 flags addr offset))) + (vec_unpacks_high $I32X4 (load_v64 $I32X4 flags addr offset))) + + +;; Helper to load a 64-bit half-size vector from memory. +(decl load_v64 (Type MemFlags Value Offset32) Reg) + +;; Any big-endian source value, BE lane order. +(rule (load_v64 _ flags @ (bigendian) addr offset) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)) + +;; Any little-endian source value, LE lane order. +(rule (load_v64 _ flags @ (littleendian) addr offset) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)) + +;; Big-endian or little-endian 8x8-bit source value, BE lane order. +(rule (load_v64 (multi_lane 8 16) flags addr offset) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0)) + +;; Big-endian or little-endian 8x8-bit source value, LE lane order. +(rule (load_v64 (multi_lane 8 16) flags addr offset) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0)) + +;; Little-endian 4x16-bit source value, BE lane order. +(rule (load_v64 (multi_lane 16 8) flags @ (littleendian) addr offset) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_rot_imm $I16X8 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 8)) + +;; Big-endian 4x16-bit source value, LE lane order. +(rule (load_v64 (multi_lane 16 8) flags @ (bigendian) addr offset) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_rot_imm $I16X8 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 8)) + +;; Little-endian 2x32-bit source value, BE lane order. 
+(rule (load_v64 (multi_lane 32 4) flags @ (littleendian) addr offset) + (if-let (LaneOrder.BigEndian) (lane_order)) + (vec_rot_imm $I64X2 + (vec_load_lane_little_undef $I64X2 (lower_address flags addr offset) 0) 32)) + +;; Big-endian 2x32-bit source value, LE lane order. +(rule (load_v64 (multi_lane 32 4) flags @ (bigendian) addr offset) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (vec_rot_imm $I64X2 + (vec_load_lane_undef $I64X2 (lower_address flags addr offset) 0) 32)) ;;;; Rules for `store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2716,25 +2836,114 @@ (side_effect (vec_store_lane_little $F64X2 val (lower_address flags addr offset) 0))) -;; Store 128-bit big-endian vector type. +;; Store 128-bit big-endian vector type, BE lane order - direct store. (rule (lower (store flags @ (bigendian) val @ (value_type (vr128_ty ty)) addr offset)) + (if-let (LaneOrder.BigEndian) (lane_order)) (side_effect (vec_store val (lower_address flags addr offset)))) -;; Store 128-bit little-endian vector type (z15 instruction). +;; Store 128-bit little-endian vector type, BE lane order - byte-reversed store. (rule (lower (store flags @ (littleendian) - val @ (value_type (and (vr128_ty ty) (vxrs_ext2_enabled))) addr offset)) - (side_effect (vec_storerev val (lower_address flags addr offset)))) + val @ (value_type (vr128_ty ty)) addr offset)) + (if-let (LaneOrder.BigEndian) (lane_order)) + (side_effect (vec_store_byte_rev ty val flags addr offset))) -;; Store 128-bit little-endian vector type (via GPRs on z14). +;; Store 128-bit big-endian vector type, LE lane order - element-reversed store. +(rule (lower (store flags @ (bigendian) + val @ (value_type (vr128_ty ty)) addr offset)) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (side_effect (vec_store_elt_rev ty val flags addr offset))) + +;; Store 128-bit little-endian vector type, LE lane order - fully-reversed store. (rule (lower (store flags @ (littleendian) - val @ (value_type (and (vr128_ty ty) (vxrs_ext2_disabled))) addr offset)) + val @ (value_type (vr128_ty ty)) addr offset)) + (if-let (LaneOrder.LittleEndian) (lane_order)) + (side_effect (vec_store_full_rev ty val flags addr offset))) + + +;; Helper to perform a 128-bit full-vector byte-reversed store. +(decl vec_store_full_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult) + +;; Full-vector byte-reversed store via single instruction on z15. +(rule (vec_store_full_rev (vxrs_ext2_enabled) val flags addr offset) + (vec_storerev val (lower_address flags addr offset))) + +;; Full-vector byte-reversed store via GPRs on z14. +(rule (vec_store_full_rev (vxrs_ext2_disabled) val flags addr offset) (let ((lo_addr MemArg (lower_address_bias flags addr offset 0)) (hi_addr MemArg (lower_address_bias flags addr offset 8)) (lo_val Reg (vec_extract_lane $I64X2 val 1 (zero_reg))) (hi_val Reg (vec_extract_lane $I64X2 val 0 (zero_reg)))) - (side_effect (side_effect_concat (storerev64 lo_val lo_addr) - (storerev64 hi_val hi_addr))))) + (side_effect_concat (storerev64 lo_val lo_addr) + (storerev64 hi_val hi_addr)))) + + +;; Helper to perform an element-wise byte-reversed store. +(decl vec_store_byte_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult) + +;; Element-wise byte-reversed 1x128-bit store is a full byte-reversed store. +(rule (vec_store_byte_rev $I128 val flags addr offset) + (vec_store_full_rev $I128 val flags addr offset)) + +;; Element-wise byte-reversed 16x8-bit store is a direct store. 
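+;; (Reversing the bytes of 1-byte elements is a no-op, so the value can be
+;; stored directly.)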
+(rule (vec_store_byte_rev (multi_lane 8 16) val flags addr offset) + (vec_store val (lower_address flags addr offset))) + +;; Element-wise byte-reversed store via single instruction on z15. +(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2)) + val flags addr offset) + (vec_store_byte64rev val (lower_address flags addr offset))) +(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4)) + val flags addr offset) + (vec_store_byte32rev val (lower_address flags addr offset))) +(rule (vec_store_byte_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8)) + val flags addr offset) + (vec_store_byte16rev val (lower_address flags addr offset))) + +;; Element-wise byte-reversed store as element-swapped byte-reversed store on z14. +(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2)) + val flags addr offset) + (vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset)) +(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4)) + val flags addr offset) + (vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset)) +(rule (vec_store_byte_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8)) + val flags addr offset) + (vec_store_full_rev ty (vec_elt_rev ty val) flags addr offset)) + + +;; Helper to perform an element-reversed store. +(decl vec_store_elt_rev (Type Reg MemFlags Value Offset32) SideEffectNoResult) + +;; Element-reversed 1x128-bit store is a direct store. +(rule (vec_store_elt_rev $I128 val flags addr offset) + (vec_store val (lower_address flags addr offset))) + +;; Element-reversed 16x8-bit store is a full byte-reversed store. +(rule (vec_store_elt_rev ty @ (multi_lane 8 16) val flags addr offset) + (vec_store_full_rev ty val flags addr offset)) + +;; Element-reversed store via single instruction on z15. +(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 64 2)) + val flags addr offset) + (vec_store_elt64rev val (lower_address flags addr offset))) +(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 32 4)) + val flags addr offset) + (vec_store_elt32rev val (lower_address flags addr offset))) +(rule (vec_store_elt_rev (and (vxrs_ext2_enabled) ty @ (multi_lane 16 8)) + val flags addr offset) + (vec_store_elt16rev val (lower_address flags addr offset))) + +;; Element-reversed store as element-swapped direct store on z14.
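+;; (Without the vector-enhancements facility 2, swap the elements in a register
+;; first and then use a plain vector store.)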
+(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 64 2)) + val flags addr offset) + (vec_store (vec_elt_rev ty val) (lower_address flags addr offset))) +(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 32 4)) + val flags addr offset) + (vec_store (vec_elt_rev ty val) (lower_address flags addr offset))) +(rule (vec_store_elt_rev (and (vxrs_ext2_disabled) ty @ (multi_lane 16 8)) + val flags addr offset) + (vec_store (vec_elt_rev ty val) (lower_address flags addr offset))) ;;;; Rules for 8-bit integer stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3591,24 +3800,48 @@ ;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16)))) + (if-let (LaneOrder.LittleEndian) (lane_order)) (let ((mask Reg (vec_imm $I8X16 (imm8x16 0 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120)))) (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) +(rule (lower (vhigh_bits x @ (value_type (multi_lane 8 16)))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 120 112 104 96 88 80 72 64 + 56 48 40 32 24 16 8 0)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) (rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8)))) + (if-let (LaneOrder.LittleEndian) (lane_order)) (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 0 16 32 48 64 80 96 112)))) (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) +(rule (lower (vhigh_bits x @ (value_type (multi_lane 16 8)))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 112 96 80 64 48 32 16 0)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) (rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4)))) + (if-let (LaneOrder.LittleEndian) (lane_order)) (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 128 128 128 128 0 32 64 96)))) (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) +(rule (lower (vhigh_bits x @ (value_type (multi_lane 32 4)))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 96 64 32 0)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) (rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2)))) + (if-let (LaneOrder.LittleEndian) (lane_order)) (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 128 128 128 128 128 128 0 64)))) (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) +(rule (lower (vhigh_bits x @ (value_type (multi_lane 64 2)))) + (if-let (LaneOrder.BigEndian) (lane_order)) + (let ((mask Reg (vec_imm $I8X16 (imm8x16 128 128 128 128 128 128 128 128 + 128 128 128 128 128 128 64 0)))) + (vec_extract_lane $I64X2 (vec_bitpermute x mask) 0 (zero_reg)))) ;;;; Rules for `is_null` and `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3870,7 +4103,8 @@ (decl lower_call_args_slots (ABISig Range ValueSlice) InstOutput) (rule (lower_call_args_slots abi (range_empty) _) (output_none)) (rule (lower_call_args_slots abi (range_unwrap head tail) args) - (let ((_ Unit (copy_to_arg 0 (abi_get_arg abi head) + (let ((_ Unit (copy_to_arg (abi_lane_order abi) + 0 (abi_get_arg abi head) (value_slice_get args head)))) (lower_call_args_slots abi tail args))) @@ -3886,7 +4120,9 @@ (decl lower_call_rets (ABISig Range InstOutputBuilder) InstOutput) (rule (lower_call_rets abi 
(range_empty) builder) (output_builder_finish builder)) (rule (lower_call_rets abi (range_unwrap head tail) builder) - (let ((ret ValueRegs (copy_from_arg (abi_sized_stack_arg_space abi) (abi_get_ret abi head))) + (let ((ret ValueRegs (copy_from_arg (abi_lane_order abi) + (abi_sized_stack_arg_space abi) + (abi_get_ret abi head))) (_ Unit (output_builder_push builder ret))) (lower_call_rets abi tail builder))) diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index a2bfae76f989..99b6fbdce670 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -6,8 +6,8 @@ pub mod generated_code; // Types that the generated ISLE code uses via `use super::*`. use crate::isa::s390x::abi::{S390xMachineDeps, REG_SAVE_AREA_SIZE}; use crate::isa::s390x::inst::{ - gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg, - MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted, + gpr, stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, LaneOrder, + MemArg, MemArgPair, SymbolReloc, UImm12, UImm16Shifted, UImm32Shifted, }; use crate::isa::s390x::settings::Flags as IsaFlags; use crate::machinst::isle::*; @@ -102,6 +102,10 @@ where ABISig::from_func_sig::(sig, self.flags).unwrap() } + fn abi_lane_order(&mut self, abi: &ABISig) -> LaneOrder { + lane_order_for_call_conv(abi.call_conv()) + } + fn abi_accumulate_outgoing_args_size(&mut self, abi: &ABISig) -> Unit { let off = abi.sized_stack_arg_space() + abi.sized_stack_ret_space(); self.lower_ctx @@ -405,9 +409,36 @@ where UImm16Shifted::maybe_from_u64(n) } + #[inline] + fn lane_order(&mut self) -> Option { + Some(lane_order_for_call_conv(self.lower_ctx.abi().call_conv())) + } + #[inline] fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 { - ty.lane_count() as u8 - 1 - idx + match self.lane_order().unwrap() { + LaneOrder::LittleEndian => ty.lane_count() as u8 - 1 - idx, + LaneOrder::BigEndian => idx, + } + } + + #[inline] + fn be_vec_const(&mut self, ty: Type, n: u128) -> u128 { + match self.lane_order().unwrap() { + LaneOrder::LittleEndian => n, + LaneOrder::BigEndian => { + let lane_count = ty.lane_count(); + let lane_bits = ty.lane_bits(); + let lane_mask = (1u128 << lane_bits) - 1; + let mut n_le = n; + let mut n_be = 0u128; + for _ in 0..lane_count { + n_be = (n_be << lane_bits) | (n_le & lane_mask); + n_le = n_le >> lane_bits; + } + n_be + } + } } #[inline] @@ -419,17 +450,19 @@ where #[inline] fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) { - let bytes = idx.to_be_bytes(); + let bytes = match self.lane_order().unwrap() { + LaneOrder::LittleEndian => idx.to_be_bytes().map(|x| { + if x < 16 { + 15 - x + } else if x < 32 { + 47 - x + } else { + 128 + } + }), + LaneOrder::BigEndian => idx.to_le_bytes().map(|x| if x < 32 { x } else { 128 }), + }; let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16); - let bytes = bytes.map(|x| { - if x < 16 { - 15 - x - } else if x < 32 { - 47 - x - } else { - 128 - } - }); let permute_mask = u128::from_be_bytes(bytes); (permute_mask, and_mask) } @@ -813,6 +846,16 @@ where } } +/// Lane order to be used for a given calling convention. +#[inline] +fn lane_order_for_call_conv(call_conv: CallConv) -> LaneOrder { + if call_conv.extends_wasmtime() { + LaneOrder::LittleEndian + } else { + LaneOrder::BigEndian + } +} + /// Zero-extend the low `from_bits` bits of `value` to a full u64. 
#[inline] fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 { diff --git a/cranelift/filetests/filetests/isa/s390x/vec-abi.clif b/cranelift/filetests/filetests/isa/s390x/vec-abi.clif new file mode 100644 index 000000000000..10bbb154b044 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-abi.clif @@ -0,0 +1,127 @@ +test compile precise-output +target s390x + +function %caller_be_to_be(i64x2, i32x4, i16x8, i8x16) -> i32x4 { + fn0 = %callee_be(i64x2, i32x4, i16x8, i8x16) -> i32x4 + +block0(v0: i64x2, v1: i32x4, v2: i16x8, v3: i8x16): + v4 = call fn0(v0, v1, v2, v3) + return v4 +} + +; stmg %r14, %r15, 112(%r15) +; aghi %r15, -160 +; virtual_sp_offset_adjust 160 +; block0: +; bras %r1, 12 ; data %callee_be + 0 ; lg %r3, 0(%r1) +; basr %r14, %r3 +; lmg %r14, %r15, 272(%r15) +; br %r14 + +function %caller_be_to_le(i64x2, i32x4, i16x8, i8x16) -> i32x4 { + fn0 = %callee_le(i64x2, i32x4, i16x8, i8x16) -> i32x4 wasmtime_system_v + +block0(v0: i64x2, v1: i32x4, v2: i16x8, v3: i8x16): + v4 = call fn0(v0, v1, v2, v3) + return v4 +} + +; stmg %r14, %r15, 112(%r15) +; aghi %r15, -224 +; virtual_sp_offset_adjust 160 +; std %f8, 160(%r15) +; std %f9, 168(%r15) +; std %f10, 176(%r15) +; std %f11, 184(%r15) +; std %f12, 192(%r15) +; std %f13, 200(%r15) +; std %f14, 208(%r15) +; std %f15, 216(%r15) +; block0: +; vpdi %v24, %v24, %v24, 4 +; vpdi %v20, %v25, %v25, 4 +; verllg %v25, %v20, 32 +; vpdi %v26, %v26, %v26, 4 +; verllg %v28, %v26, 32 +; verllf %v26, %v28, 16 +; vpdi %v0, %v27, %v27, 4 +; verllg %v2, %v0, 32 +; verllf %v4, %v2, 16 +; verllh %v27, %v4, 8 +; bras %r1, 12 ; data %callee_le + 0 ; lg %r3, 0(%r1) +; basr %r14, %r3 +; vpdi %v22, %v24, %v24, 4 +; verllg %v24, %v22, 32 +; ld %f8, 160(%r15) +; ld %f9, 168(%r15) +; ld %f10, 176(%r15) +; ld %f11, 184(%r15) +; ld %f12, 192(%r15) +; ld %f13, 200(%r15) +; ld %f14, 208(%r15) +; ld %f15, 216(%r15) +; lmg %r14, %r15, 336(%r15) +; br %r14 + +function %caller_le_to_be(i64x2, i32x4, i16x8, i8x16) -> i32x4 wasmtime_system_v { + fn0 = %callee_be(i64x2, i32x4, i16x8, i8x16) -> i32x4 + +block0(v0: i64x2, v1: i32x4, v2: i16x8, v3: i8x16): + v4 = call fn0(v0, v1, v2, v3) + return v4 +} + +; stmg %r14, %r15, 112(%r15) +; aghi %r15, -224 +; virtual_sp_offset_adjust 160 +; std %f8, 160(%r15) +; std %f9, 168(%r15) +; std %f10, 176(%r15) +; std %f11, 184(%r15) +; std %f12, 192(%r15) +; std %f13, 200(%r15) +; std %f14, 208(%r15) +; std %f15, 216(%r15) +; block0: +; vpdi %v24, %v24, %v24, 4 +; vpdi %v20, %v25, %v25, 4 +; verllg %v25, %v20, 32 +; vpdi %v26, %v26, %v26, 4 +; verllg %v28, %v26, 32 +; verllf %v26, %v28, 16 +; vpdi %v0, %v27, %v27, 4 +; verllg %v2, %v0, 32 +; verllf %v4, %v2, 16 +; verllh %v27, %v4, 8 +; bras %r1, 12 ; data %callee_be + 0 ; lg %r3, 0(%r1) +; basr %r14, %r3 +; vpdi %v22, %v24, %v24, 4 +; verllg %v24, %v22, 32 +; ld %f8, 160(%r15) +; ld %f9, 168(%r15) +; ld %f10, 176(%r15) +; ld %f11, 184(%r15) +; ld %f12, 192(%r15) +; ld %f13, 200(%r15) +; ld %f14, 208(%r15) +; ld %f15, 216(%r15) +; lmg %r14, %r15, 336(%r15) +; br %r14 + +function %caller_le_to_le(i64x2, i32x4, i16x8, i8x16) -> i32x4 wasmtime_system_v { + fn0 = %callee_le(i64x2, i32x4, i16x8, i8x16) -> i32x4 wasmtime_system_v + +block0(v0: i64x2, v1: i32x4, v2: i16x8, v3: i8x16): + v4 = call fn0(v0, v1, v2, v3) + return v4 +} + +; stmg %r14, %r15, 112(%r15) +; aghi %r15, -160 +; virtual_sp_offset_adjust 160 +; block0: +; bras %r1, 12 ; data %callee_le + 0 ; lg %r3, 0(%r1) +; basr %r14, %r3 +; lmg %r14, %r15, 272(%r15) +; br %r14 + diff --git 
a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif index 334c43821b8a..4526bf54c0b2 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif @@ -457,21 +457,6 @@ block0(v0: i8x16, v1: i8x16): ; vpksh %v24, %v17, %v23 ; br %r14 -function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { -block0(v0: i32x4, v1: i32x4): - v2 = iadd_pairwise.i32x4 v0, v1 - return v2 -} - -; block0: -; vrepib %v5, 32 -; vsrlb %v7, %v25, %v5 -; vaf %v17, %v25, %v7 -; vsrlb %v19, %v24, %v5 -; vaf %v21, %v24, %v19 -; vpkg %v24, %v17, %v21 -; br %r14 - function %usub_sat64x2(i64x2, i64x2) -> i64x2 { block0(v0: i64x2, v1: i64x2): v2 = usub_sat.i64x2 v0, v1 @@ -568,7 +553,7 @@ block0(v0: i8x16, v1: i8x16): ; vpksh %v24, %v17, %v23 ; br %r14 -function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 { +function %iadd_pairwise_i32x4_be(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = iadd_pairwise.i32x4 v0, v1 return v2 @@ -576,14 +561,14 @@ block0(v0: i32x4, v1: i32x4): ; block0: ; vrepib %v5, 32 -; vsrlb %v7, %v25, %v5 -; vaf %v17, %v25, %v7 -; vsrlb %v19, %v24, %v5 -; vaf %v21, %v24, %v19 +; vsrlb %v7, %v24, %v5 +; vaf %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vaf %v21, %v25, %v19 ; vpkg %v24, %v17, %v21 ; br %r14 -function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 { +function %iadd_pairwise_i16x8_be(i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8): v2 = iadd_pairwise.i16x8 v0, v1 return v2 @@ -591,14 +576,14 @@ block0(v0: i16x8, v1: i16x8): ; block0: ; vrepib %v5, 16 -; vsrlb %v7, %v25, %v5 -; vah %v17, %v25, %v7 -; vsrlb %v19, %v24, %v5 -; vah %v21, %v24, %v19 +; vsrlb %v7, %v24, %v5 +; vah %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vah %v21, %v25, %v19 ; vpkf %v24, %v17, %v21 ; br %r14 -function %iadd_pairwise_i8x16(i8x16, i8x16) -> i8x16 { +function %iadd_pairwise_i8x16_be(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): v2 = iadd_pairwise.i8x16 v0, v1 return v2 @@ -606,13 +591,58 @@ block0(v0: i8x16, v1: i8x16): ; block0: ; vrepib %v5, 8 -; vsrlb %v7, %v25, %v5 -; vab %v17, %v25, %v7 -; vsrlb %v19, %v24, %v5 -; vab %v21, %v24, %v19 +; vsrlb %v7, %v24, %v5 +; vab %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vab %v21, %v25, %v19 ; vpkh %v24, %v17, %v21 ; br %r14 +function %iadd_pairwise_i32x4_le(i32x4, i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = iadd_pairwise.i32x4 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 32 +; vsrlb %v7, %v24, %v5 +; vaf %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vaf %v21, %v25, %v19 +; vpkg %v24, %v21, %v17 +; br %r14 + +function %iadd_pairwise_i16x8_le(i16x8, i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = iadd_pairwise.i16x8 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 16 +; vsrlb %v7, %v24, %v5 +; vah %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vah %v21, %v25, %v19 +; vpkf %v24, %v21, %v17 +; br %r14 + +function %iadd_pairwise_i8x16_le(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = iadd_pairwise.i8x16 v0, v1 + return v2 +} + +; block0: +; vrepib %v5, 8 +; vsrlb %v7, %v24, %v5 +; vab %v17, %v24, %v7 +; vsrlb %v19, %v25, %v5 +; vab %v21, %v25, %v19 +; vpkh %v24, %v21, %v17 +; br %r14 + function %imul_i64x2(i64x2, i64x2) -> i64x2 { block0(v0: i64x2, v1: i64x2): v2 = imul.i64x2 v0, v1 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-constants-le-lane.clif 
b/cranelift/filetests/filetests/isa/s390x/vec-constants-le-lane.clif new file mode 100644 index 000000000000..0d99f426c942 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-constants-le-lane.clif @@ -0,0 +1,213 @@ +test compile precise-output +target s390x + +function %vconst_i64x2_zero() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i64x2_splat1() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [32767 32767] + return v1 +} + +; block0: +; vrepig %v24, 32767 +; br %r14 + +function %vconst_i64x2_splat2() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [-32768 -32768] + return v1 +} + +; block0: +; vrepig %v24, -32768 +; br %r14 + +function %vconst_i64x2_splat3() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [32768 32768] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000000008000 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_splat4() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [-32769 -32769] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0xffffffffffff7fff ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i64x2_mixed() -> i64x2 wasmtime_system_v { +block0: + v1 = vconst.i64x2 [1 2] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000000000000020000000000000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_zero() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i32x4_splat1() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepif %v24, 32767 +; br %r14 + +function %vconst_i32x4_splat2() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [-32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepif %v24, -32768 +; br %r14 + +function %vconst_i32x4_splat3() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [32768 32768 32768 32768] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0x00008000 ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat4() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [-32769 -32769 -32769 -32769] + return v1 +} + +; block0: +; bras %r1, 8 ; data.u32 0xffff7fff ; vlrepf %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_splat_i64() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [1 2 1 2] + return v1 +} + +; block0: +; bras %r1, 12 ; data.u64 0x0000000200000001 ; vlrepg %v24, 0(%r1) +; br %r14 + +function %vconst_i32x4_mixed() -> i32x4 wasmtime_system_v { +block0: + v1 = vconst.i32x4 [1 2 3 4] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x00000004000000030000000200000001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i16x8_zero() -> i16x8 wasmtime_system_v { +block0: + v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i16x8_splat1() -> i16x8 wasmtime_system_v { +block0: + v1 = vconst.i16x8 [32767 32767 32767 32767 32767 32767 32767 32767] + return v1 +} + +; block0: +; vrepih %v24, 32767 +; br %r14 + +function %vconst_i16x8_splat2() -> i16x8 wasmtime_system_v { +block0: + v1 = vconst.i16x8 [-32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768] + return v1 +} + +; block0: +; vrepih %v24, -32768 +; br %r14 + +function %vconst_i16x8_mixed() -> i16x8 wasmtime_system_v { +block0: + v1 = vconst.i16x8 [1 2 3 4 5 6 7 8] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 
0x00080007000600050004000300020001 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_i8x16_zero() -> i8x16 wasmtime_system_v { +block0: + v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v1 +} + +; block0: +; vgbm %v24, 0 +; br %r14 + +function %vconst_i8x16_splat1() -> i8x16 wasmtime_system_v { +block0: + v1 = vconst.i8x16 [127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127] + return v1 +} + +; block0: +; vrepib %v24, 127 +; br %r14 + +function %vconst_i8x16_splat2() -> i8x16 wasmtime_system_v { +block0: + v1 = vconst.i8x16 [-128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128] + return v1 +} + +; block0: +; vrepib %v24, 128 +; br %r14 + +function %vconst_i8x16_mixed() -> i8x16 wasmtime_system_v { +block0: + v1 = vconst.i8x16 [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x100f0e0d0c0b0a090807060504030201 ; vl %v24, 0(%r1) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-constants.clif b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif index b5a6969f2b3e..31858dafe3da 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-constants.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-constants.clif @@ -58,7 +58,7 @@ block0: } ; block0: -; bras %r1, 20 ; data.u128 0x00000000000000020000000000000001 ; vl %v24, 0(%r1) +; bras %r1, 20 ; data.u128 0x00000000000000010000000000000002 ; vl %v24, 0(%r1) ; br %r14 function %vconst_i32x4_zero() -> i32x4 { @@ -118,7 +118,7 @@ block0: } ; block0: -; bras %r1, 12 ; data.u64 0x0000000200000001 ; vlrepg %v24, 0(%r1) +; bras %r1, 12 ; data.u64 0x0000000100000002 ; vlrepg %v24, 0(%r1) ; br %r14 function %vconst_i32x4_mixed() -> i32x4 { @@ -128,7 +128,7 @@ block0: } ; block0: -; bras %r1, 20 ; data.u128 0x00000004000000030000000200000001 ; vl %v24, 0(%r1) +; bras %r1, 20 ; data.u128 0x00000001000000020000000300000004 ; vl %v24, 0(%r1) ; br %r14 function %vconst_i16x8_zero() -> i16x8 { @@ -168,7 +168,7 @@ block0: } ; block0: -; bras %r1, 20 ; data.u128 0x00080007000600050004000300020001 ; vl %v24, 0(%r1) +; bras %r1, 20 ; data.u128 0x00010002000300040005000600070008 ; vl %v24, 0(%r1) ; br %r14 function %vconst_i8x16_zero() -> i8x16 { @@ -208,6 +208,6 @@ block0: } ; block0: -; bras %r1, 20 ; data.u128 0x100f0e0d0c0b0a090807060504030201 ; vl %v24, 0(%r1) +; bras %r1, 20 ; data.u128 0x0102030405060708090a0b0c0d0e0f10 ; vl %v24, 0(%r1) ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-conversions-le-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-conversions-le-lane.clif new file mode 100644 index 000000000000..032f4c16b171 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-conversions-le-lane.clif @@ -0,0 +1,222 @@ +test compile precise-output +target s390x + +function %snarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = snarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpksg %v24, %v25, %v24 +; br %r14 + +function %snarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = snarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpksf %v24, %v25, %v24 +; br %r14 + +function %snarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = snarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpksh %v24, %v25, %v24 +; br %r14 + +function %unarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = unarrow.i64x2 v0, v1 + return v2 
+} + +; block0: +; vgbm %v5, 0 +; vmxg %v7, %v24, %v5 +; vmxg %v17, %v25, %v5 +; vpklsg %v24, %v17, %v7 +; br %r14 + +function %unarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = unarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxf %v7, %v24, %v5 +; vmxf %v17, %v25, %v5 +; vpklsf %v24, %v17, %v7 +; br %r14 + +function %unarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = unarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vmxh %v7, %v24, %v5 +; vmxh %v17, %v25, %v5 +; vpklsh %v24, %v17, %v7 +; br %r14 + +function %uunarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = uunarrow.i64x2 v0, v1 + return v2 +} + +; block0: +; vpklsg %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = uunarrow.i32x4 v0, v1 + return v2 +} + +; block0: +; vpklsf %v24, %v25, %v24 +; br %r14 + +function %uunarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = uunarrow.i16x8 v0, v1 + return v2 +} + +; block0: +; vpklsh %v24, %v25, %v24 +; br %r14 + +function %swiden_low_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v { +block0(v0: i32x4): + v1 = swiden_low.i32x4 v0 + return v1 +} + +; block0: +; vuplf %v24, %v24 +; br %r14 + +function %swiden_low_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v { +block0(v0: i16x8): + v1 = swiden_low.i16x8 v0 + return v1 +} + +; block0: +; vuplh %v24, %v24 +; br %r14 + +function %swiden_low_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v { +block0(v0: i8x16): + v1 = swiden_low.i8x16 v0 + return v1 +} + +; block0: +; vuplb %v24, %v24 +; br %r14 + +function %swiden_high_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v { +block0(v0: i32x4): + v1 = swiden_high.i32x4 v0 + return v1 +} + +; block0: +; vuphf %v24, %v24 +; br %r14 + +function %swiden_high_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v { +block0(v0: i16x8): + v1 = swiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuphh %v24, %v24 +; br %r14 + +function %swiden_high_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v { +block0(v0: i8x16): + v1 = swiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuphb %v24, %v24 +; br %r14 + +function %uwiden_low_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v { +block0(v0: i32x4): + v1 = uwiden_low.i32x4 v0 + return v1 +} + +; block0: +; vupllf %v24, %v24 +; br %r14 + +function %uwiden_low_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v { +block0(v0: i16x8): + v1 = uwiden_low.i16x8 v0 + return v1 +} + +; block0: +; vupllh %v24, %v24 +; br %r14 + +function %uwiden_low_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v { +block0(v0: i8x16): + v1 = uwiden_low.i8x16 v0 + return v1 +} + +; block0: +; vupllb %v24, %v24 +; br %r14 + +function %uwiden_high_i32x4_i64x2(i32x4) -> i64x2 wasmtime_system_v { +block0(v0: i32x4): + v1 = uwiden_high.i32x4 v0 + return v1 +} + +; block0: +; vuplhf %v24, %v24 +; br %r14 + +function %uwiden_high_i16x8_i32x4(i16x8) -> i32x4 wasmtime_system_v { +block0(v0: i16x8): + v1 = uwiden_high.i16x8 v0 + return v1 +} + +; block0: +; vuplhh %v24, %v24 +; br %r14 + +function %uwiden_high_i8x16_i16x8(i8x16) -> i16x8 wasmtime_system_v { +block0(v0: i8x16): + v1 = uwiden_high.i8x16 v0 + return v1 +} + +; block0: +; vuplhb %v24, %v24 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif index 
b137c8cd214b..edcfcb9b17be 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-conversions.clif @@ -8,7 +8,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpksg %v24, %v25, %v24 +; vpksg %v24, %v24, %v25 ; br %r14 function %snarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { @@ -18,7 +18,7 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vpksf %v24, %v25, %v24 +; vpksf %v24, %v24, %v25 ; br %r14 function %snarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { @@ -28,7 +28,7 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vpksh %v24, %v25, %v24 +; vpksh %v24, %v24, %v25 ; br %r14 function %unarrow_i64x2_i32x4(i64x2, i64x2) -> i32x4 { @@ -39,8 +39,8 @@ block0(v0: i64x2, v1: i64x2): ; block0: ; vgbm %v5, 0 -; vmxg %v7, %v25, %v5 -; vmxg %v17, %v24, %v5 +; vmxg %v7, %v24, %v5 +; vmxg %v17, %v25, %v5 ; vpklsg %v24, %v7, %v17 ; br %r14 @@ -52,8 +52,8 @@ block0(v0: i32x4, v1: i32x4): ; block0: ; vgbm %v5, 0 -; vmxf %v7, %v25, %v5 -; vmxf %v17, %v24, %v5 +; vmxf %v7, %v24, %v5 +; vmxf %v17, %v25, %v5 ; vpklsf %v24, %v7, %v17 ; br %r14 @@ -65,8 +65,8 @@ block0(v0: i16x8, v1: i16x8): ; block0: ; vgbm %v5, 0 -; vmxh %v7, %v25, %v5 -; vmxh %v17, %v24, %v5 +; vmxh %v7, %v24, %v5 +; vmxh %v17, %v25, %v5 ; vpklsh %v24, %v7, %v17 ; br %r14 @@ -77,7 +77,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpklsg %v24, %v25, %v24 +; vpklsg %v24, %v24, %v25 ; br %r14 function %uunarrow_i32x4_i16x8(i32x4, i32x4) -> i16x8 { @@ -87,7 +87,7 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vpklsf %v24, %v25, %v24 +; vpklsf %v24, %v24, %v25 ; br %r14 function %uunarrow_i16x8_i8x16(i16x8, i16x8) -> i8x16 { @@ -97,7 +97,7 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vpklsh %v24, %v25, %v24 +; vpklsh %v24, %v24, %v25 ; br %r14 function %swiden_low_i32x4_i64x2(i32x4) -> i64x2 { @@ -107,7 +107,7 @@ block0(v0: i32x4): } ; block0: -; vuplf %v24, %v24 +; vuphf %v24, %v24 ; br %r14 function %swiden_low_i16x8_i32x4(i16x8) -> i32x4 { @@ -117,7 +117,7 @@ block0(v0: i16x8): } ; block0: -; vuplh %v24, %v24 +; vuphh %v24, %v24 ; br %r14 function %swiden_low_i8x16_i16x8(i8x16) -> i16x8 { @@ -127,7 +127,7 @@ block0(v0: i8x16): } ; block0: -; vuplb %v24, %v24 +; vuphb %v24, %v24 ; br %r14 function %swiden_high_i32x4_i64x2(i32x4) -> i64x2 { @@ -137,7 +137,7 @@ block0(v0: i32x4): } ; block0: -; vuphf %v24, %v24 +; vuplf %v24, %v24 ; br %r14 function %swiden_high_i16x8_i32x4(i16x8) -> i32x4 { @@ -147,7 +147,7 @@ block0(v0: i16x8): } ; block0: -; vuphh %v24, %v24 +; vuplh %v24, %v24 ; br %r14 function %swiden_high_i8x16_i16x8(i8x16) -> i16x8 { @@ -157,7 +157,7 @@ block0(v0: i8x16): } ; block0: -; vuphb %v24, %v24 +; vuplb %v24, %v24 ; br %r14 function %uwiden_low_i32x4_i64x2(i32x4) -> i64x2 { @@ -167,7 +167,7 @@ block0(v0: i32x4): } ; block0: -; vupllf %v24, %v24 +; vuplhf %v24, %v24 ; br %r14 function %uwiden_low_i16x8_i32x4(i16x8) -> i32x4 { @@ -177,7 +177,7 @@ block0(v0: i16x8): } ; block0: -; vupllh %v24, %v24 +; vuplhh %v24, %v24 ; br %r14 function %uwiden_low_i8x16_i16x8(i8x16) -> i16x8 { @@ -187,7 +187,7 @@ block0(v0: i8x16): } ; block0: -; vupllb %v24, %v24 +; vuplhb %v24, %v24 ; br %r14 function %uwiden_high_i32x4_i64x2(i32x4) -> i64x2 { @@ -197,7 +197,7 @@ block0(v0: i32x4): } ; block0: -; vuplhf %v24, %v24 +; vupllf %v24, %v24 ; br %r14 function %uwiden_high_i16x8_i32x4(i16x8) -> i32x4 { @@ -207,7 +207,7 @@ block0(v0: i16x8): } ; block0: -; vuplhh %v24, %v24 +; vupllh %v24, %v24 ; br %r14 function %uwiden_high_i8x16_i16x8(i8x16) -> i16x8 { @@ -217,6 +217,6 @@ block0(v0: 
i8x16): } ; block0: -; vuplhb %v24, %v24 +; vupllb %v24, %v24 ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif index fc356d57a762..2a818596f282 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-fp.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-fp.clif @@ -21,7 +21,17 @@ block0: ; vgbm %v24, 0 ; br %r14 -function %vconst_f32x4_mixed() -> f32x4 { +function %vconst_f32x4_mixed_be() -> f32x4 { +block0: + v1 = vconst.f32x4 [0x1.0 0x2.0 0x3.0 0x4.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x3f800000400000004040000040800000 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_f32x4_mixed_le() -> f32x4 wasmtime_system_v { block0: v1 = vconst.f32x4 [0x1.0 0x2.0 0x3.0 0x4.0] return v1 @@ -31,7 +41,17 @@ block0: ; bras %r1, 20 ; data.u128 0x4080000040400000400000003f800000 ; vl %v24, 0(%r1) ; br %r14 -function %vconst_f64x2_mixed() -> f64x2 { +function %vconst_f64x2_mixed_be() -> f64x2 { +block0: + v1 = vconst.f64x2 [0x1.0 0x2.0] + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x3ff00000000000004000000000000000 ; vl %v24, 0(%r1) +; br %r14 + +function %vconst_f64x2_mixed_le() -> f64x2 wasmtime_system_v { block0: v1 = vconst.f64x2 [0x1.0 0x2.0] return v1 @@ -261,7 +281,18 @@ block0(v0: f64x2): ; vflcdb %v24, %v24 ; br %r14 -function %fvpromote_low_f32x4(f32x4) -> f64x2 { +function %fvpromote_low_f32x4_be(f32x4) -> f64x2 { +block0(v0: f32x4): + v1 = fvpromote_low v0 + return v1 +} + +; block0: +; vmrhf %v3, %v24, %v24 +; vldeb %v24, %v3 +; br %r14 + +function %fvpromote_low_f32x4_le(f32x4) -> f64x2 wasmtime_system_v { block0(v0: f32x4): v1 = fvpromote_low v0 return v1 @@ -272,7 +303,7 @@ block0(v0: f32x4): ; vldeb %v24, %v3 ; br %r14 -function %fvdemote_f64x2(f64x2) -> f32x4 { +function %fvdemote_f64x2_be(f64x2) -> f32x4 { block0(v0: f64x2): v1 = fvdemote v0 return v1 @@ -280,9 +311,22 @@ block0(v0: f64x2): ; block0: ; vledb %v3, %v24, 0, 0 -; vgbm %v5, 0 -; bras %r1, 20 ; data.u128 0x10101010101010100001020308090a0b ; vl %v7, 0(%r1) -; vperm %v24, %v3, %v5, %v7 +; vesrlg %v5, %v3, 32 +; vgbm %v7, 0 +; vpkg %v24, %v5, %v7 +; br %r14 + +function %fvdemote_f64x2_le(f64x2) -> f32x4 wasmtime_system_v { +block0(v0: f64x2): + v1 = fvdemote v0 + return v1 +} + +; block0: +; vledb %v3, %v24, 0, 0 +; vesrlg %v5, %v3, 32 +; vgbm %v7, 0 +; vpkg %v24, %v7, %v5 ; br %r14 function %ceil_f32x4(f32x4) -> f32x4 { @@ -462,7 +506,18 @@ block0(v0: i64x2): ; br %r14 -function %fcvt_low_from_sint_i32x4_f64x2(i32x4) -> f64x2 { +function %fcvt_low_from_sint_i32x4_f64x2_be(i32x4) -> f64x2 { +block0(v0: i32x4): + v1 = fcvt_low_from_sint.f64x2 v0 + return v1 +} + +; block0: +; vuphf %v3, %v24 +; vcdgb %v24, %v3, 0, 4 +; br %r14 + +function %fcvt_low_from_sint_i32x4_f64x2_le(i32x4) -> f64x2 wasmtime_system_v { block0(v0: i32x4): v1 = fcvt_low_from_sint.f64x2 v0 return v1 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif index 5ee1ef906fa6..9fc1406e31b3 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane-arch13.clif @@ -9,7 +9,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { @@ -20,7 +20,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 0 +; vleg %v24, 0(%r2), 1 ; br %r14 function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 
{ @@ -31,7 +31,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlebrg %v24, 0(%r2), 1 +; vlebrg %v24, 0(%r2), 0 ; br %r14 function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { @@ -42,7 +42,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlebrg %v24, 0(%r2), 0 +; vlebrg %v24, 0(%r2), 1 ; br %r14 function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 { @@ -53,7 +53,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { @@ -64,7 +64,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 0 +; vlef %v24, 0(%r2), 3 ; br %r14 function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { @@ -75,7 +75,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlebrf %v24, 0(%r2), 3 +; vlebrf %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { @@ -86,7 +86,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlebrf %v24, 0(%r2), 0 +; vlebrf %v24, 0(%r2), 3 ; br %r14 function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 { @@ -97,7 +97,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vleh %v24, 0(%r2), 7 +; vleh %v24, 0(%r2), 0 ; br %r14 function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { @@ -108,7 +108,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vleh %v24, 0(%r2), 0 +; vleh %v24, 0(%r2), 7 ; br %r14 function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { @@ -119,7 +119,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vlebrh %v24, 0(%r2), 7 +; vlebrh %v24, 0(%r2), 0 ; br %r14 function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { @@ -130,7 +130,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vlebrh %v24, 0(%r2), 0 +; vlebrh %v24, 0(%r2), 7 ; br %r14 function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 { @@ -141,7 +141,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { @@ -152,7 +152,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 0 +; vleb %v24, 0(%r2), 15 ; br %r14 function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { @@ -163,7 +163,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { @@ -174,7 +174,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 0 +; vleb %v24, 0(%r2), 15 ; br %r14 function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { @@ -185,7 +185,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { @@ -196,7 +196,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 0 +; vleg %v24, 0(%r2), 1 ; br %r14 function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 { @@ -207,7 +207,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlebrg %v24, 0(%r2), 1 +; vlebrg %v24, 0(%r2), 0 ; br %r14 function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { @@ -218,7 +218,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlebrg %v24, 0(%r2), 0 +; vlebrg %v24, 0(%r2), 1 ; br %r14 function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 { @@ -229,7 +229,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { @@ -240,7 +240,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 0 +; vlef %v24, 0(%r2), 3 ; br %r14 function 
%insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { @@ -251,7 +251,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlebrf %v24, 0(%r2), 3 +; vlebrf %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { @@ -262,7 +262,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlebrf %v24, 0(%r2), 0 +; vlebrf %v24, 0(%r2), 3 ; br %r14 function %extractlane_i64x2_mem_0(i64x2, i64) { @@ -273,7 +273,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 1 +; vsteg %v24, 0(%r2), 0 ; br %r14 function %extractlane_i64x2_mem_1(i64x2, i64) { @@ -284,7 +284,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 0 +; vsteg %v24, 0(%r2), 1 ; br %r14 function %extractlane_i64x2_mem_little_0(i64x2, i64) { @@ -295,7 +295,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vstebrg %v24, 0(%r2), 1 +; vstebrg %v24, 0(%r2), 0 ; br %r14 function %extractlane_i64x2_mem_little_1(i64x2, i64) { @@ -306,7 +306,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vstebrg %v24, 0(%r2), 0 +; vstebrg %v24, 0(%r2), 1 ; br %r14 function %extractlane_i32x4_mem_0(i32x4, i64) { @@ -317,7 +317,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 3 +; vstef %v24, 0(%r2), 0 ; br %r14 function %extractlane_i32x4_mem_3(i32x4, i64) { @@ -328,7 +328,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 0 +; vstef %v24, 0(%r2), 3 ; br %r14 function %extractlane_i32x4_mem_little_0(i32x4, i64) { @@ -339,7 +339,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstebrf %v24, 0(%r2), 3 +; vstebrf %v24, 0(%r2), 0 ; br %r14 function %extractlane_i32x4_mem_little_3(i32x4, i64) { @@ -350,7 +350,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstebrf %v24, 0(%r2), 0 +; vstebrf %v24, 0(%r2), 3 ; br %r14 function %extractlane_i16x8_mem_0(i16x8, i64) { @@ -361,7 +361,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vsteh %v24, 0(%r2), 7 +; vsteh %v24, 0(%r2), 0 ; br %r14 function %extractlane_i16x8_mem_7(i16x8, i64) { @@ -372,7 +372,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vsteh %v24, 0(%r2), 0 +; vsteh %v24, 0(%r2), 7 ; br %r14 function %extractlane_i16x8_mem_little_0(i16x8, i64) { @@ -383,7 +383,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vstebrh %v24, 0(%r2), 7 +; vstebrh %v24, 0(%r2), 0 ; br %r14 function %extractlane_i16x8_mem_little_7(i16x8, i64) { @@ -394,7 +394,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vstebrh %v24, 0(%r2), 0 +; vstebrh %v24, 0(%r2), 7 ; br %r14 function %extractlane_i8x16_mem_0(i8x16, i64) { @@ -405,7 +405,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 15 +; vsteb %v24, 0(%r2), 0 ; br %r14 function %extractlane_i8x16_mem_15(i8x16, i64) { @@ -416,7 +416,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 0 +; vsteb %v24, 0(%r2), 15 ; br %r14 function %extractlane_i8x16_mem_little_0(i8x16, i64) { @@ -427,7 +427,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 15 +; vsteb %v24, 0(%r2), 0 ; br %r14 function %extractlane_i8x16_mem_little_15(i8x16, i64) { @@ -438,7 +438,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 0 +; vsteb %v24, 0(%r2), 15 ; br %r14 function %extractlane_f64x2_mem_0(f64x2, i64) { @@ -449,7 +449,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 1 +; vsteg %v24, 0(%r2), 0 ; br %r14 function %extractlane_f64x2_mem_1(f64x2, i64) { @@ -460,7 +460,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 0 +; vsteg %v24, 0(%r2), 1 ; br %r14 function %extractlane_f64x2_mem_little_0(f64x2, i64) { @@ -471,7 +471,7 @@ 
block0(v0: f64x2, v1: i64): } ; block0: -; vstebrg %v24, 0(%r2), 1 +; vstebrg %v24, 0(%r2), 0 ; br %r14 function %extractlane_f64x2_mem_little_1(f64x2, i64) { @@ -482,7 +482,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vstebrg %v24, 0(%r2), 0 +; vstebrg %v24, 0(%r2), 1 ; br %r14 function %extractlane_f32x4_mem_0(f32x4, i64) { @@ -493,7 +493,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 3 +; vstef %v24, 0(%r2), 0 ; br %r14 function %extractlane_f32x4_mem_3(f32x4, i64) { @@ -504,7 +504,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 0 +; vstef %v24, 0(%r2), 3 ; br %r14 function %extractlane_f32x4_mem_little_0(f32x4, i64) { @@ -515,7 +515,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstebrf %v24, 0(%r2), 3 +; vstebrf %v24, 0(%r2), 0 ; br %r14 function %extractlane_f32x4_mem_little_3(f32x4, i64) { @@ -526,7 +526,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstebrf %v24, 0(%r2), 0 +; vstebrf %v24, 0(%r2), 3 ; br %r14 function %splat_i64x2_mem(i64) -> i64x2 { @@ -670,7 +670,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { @@ -682,7 +682,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlebrg %v24, 0(%r2), 1 +; vlebrg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i32x4_mem(i64) -> i32x4 { @@ -694,7 +694,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { @@ -706,7 +706,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlebrf %v24, 0(%r2), 3 +; vlebrf %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i16x8_mem(i64) -> i16x8 { @@ -718,7 +718,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleh %v24, 0(%r2), 7 +; vleh %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { @@ -730,7 +730,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlebrh %v24, 0(%r2), 7 +; vlebrh %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i8x16_mem(i64) -> i8x16 { @@ -742,7 +742,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { @@ -754,7 +754,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f64x2_mem(i64) -> f64x2 { @@ -766,7 +766,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { @@ -778,7 +778,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlebrg %v24, 0(%r2), 1 +; vlebrg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f32x4_mem(i64) -> f32x4 { @@ -790,7 +790,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { @@ -802,6 +802,6 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlebrf %v24, 0(%r2), 3 +; vlebrf %v24, 0(%r2), 0 ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane-arch13.clif new file mode 100644 index 000000000000..19fbc0827e2d --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane-arch13.clif @@ -0,0 +1,807 @@ +test compile precise-output +target s390x arch13 + +function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 wasmtime_system_v { 
+block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vlebrh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + 
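+; Note: under little-endian lane order, CLIF lane 15 of an i8x16 is hardware
+; byte element 0, which is why the expected code below uses element index 0.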
+; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vlebrg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; 
vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vstebrh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vstebrg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f32x4_mem_little_0(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vstebrf %v24, 0(%r2), 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 v0 + v2 = 
splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlbrreph %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlbrrepg %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlbrrepf %v24, 0(%r2) +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrh 
%v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlebrf %v24, 0(%r2), 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane.clif new file mode 100644 index 000000000000..8e1fb6ad921d --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane-le-lane.clif @@ -0,0 +1,1964 @@ +test compile precise-output +target s390x + +function %insertlane_i64x2_0(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 1 +; br %r14 + +function %insertlane_i64x2_1(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vlvgg %v24, %r2, 0 +; br %r14 + +function %insertlane_i64x2_imm_0(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 0 + return v2 +} + +; block0: +; vleig %v24, 123, 1 +; br %r14 + +function %insertlane_i64x2_imm_1(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = iconst.i64 123 + v2 = insertlane.i64x2 v0, v1, 1 + return v2 +} + +; block0: +; vleig %v24, 123, 0 +; br %r14 + +function %insertlane_i64x2_lane_0_0(i64x2, i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_i64x2_lane_0_1(i64x2, i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 0 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_i64x2_lane_1_0(i64x2, i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function %insertlane_i64x2_lane_1_1(i64x2, i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64x2): + v2 = extractlane.i64x2 v1, 1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function 
%insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = load.i64 little v1 + v3 = insertlane.i64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_i32x4_0(i32x4, i32) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 3 +; br %r14 + +function %insertlane_i32x4_3(i32x4, i32) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32): + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vlvgf %v24, %r2, 0 +; br %r14 + +function %insertlane_i32x4_imm_0(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 0 + return v2 +} + +; block0: +; vleif %v24, 123, 3 +; br %r14 + +function %insertlane_i32x4_imm_3(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = iconst.i32 123 + v2 = insertlane.i32x4 v0, v1, 3 + return v2 +} + +; block0: +; vleif %v24, 123, 0 +; br %r14 + +function %insertlane_i32x4_lane_0_0(i32x4, i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_lane_0_3(i32x4, i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 0 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_0(i32x4, i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i32x4_lane_3_3(i32x4, i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i32x4): + v2 = extractlane.i32x4 v1, 3 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i32x4_mem_0(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + 
+function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %insertlane_i16x8_0(i16x8, i16) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 7 +; br %r14 + +function %insertlane_i16x8_7(i16x8, i16) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16): + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vlvgh %v24, %r2, 0 +; br %r14 + +function %insertlane_i16x8_imm_0(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 0 + return v2 +} + +; block0: +; vleih %v24, 123, 7 +; br %r14 + +function %insertlane_i16x8_imm_7(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = iconst.i16 123 + v2 = insertlane.i16x8 v0, v1, 7 + return v2 +} + +; block0: +; vleih %v24, 123, 0 +; br %r14 + +function %insertlane_i16x8_lane_0_0(i16x8, i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 3 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_lane_0_7(i16x8, i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 0 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vreph %v5, %v25, 7 +; vgbm %v7, 49152 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_0(i16x8, i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vreph %v5, %v25, 0 +; vgbm %v7, 3 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i16x8_lane_7_7(i16x8, i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i16x8): + v2 = extractlane.i16x8 v1, 7 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vgbm %v5, 49152 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i16x8_mem_0(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; vleh %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 0 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = load.i16 little v1 + v3 = insertlane.i16x8 v0, v2, 7 + return v3 +} + +; block0: +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 0 +; br %r14 + +function %insertlane_i8x16_0(i8x16, i8) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 15 +; br %r14 + +function %insertlane_i8x16_15(i8x16, i8) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8): + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vlvgb %v24, %r2, 0 +; br %r14 + +function %insertlane_i8x16_imm_0(i8x16) -> 
i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = iconst.i8 123 + v2 = insertlane.i8x16 v0, v1, 0 + return v2 +} + +; block0: +; vleib %v24, 123, 15 +; br %r14 + +function %insertlane_i8x16_imm_15(i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = iconst.i8 123 + v2 = insertlane.i8x16 v0, v1, 15 + return v2 +} + +; block0: +; vleib %v24, 123, 0 +; br %r14 + +function %insertlane_i8x16_lane_0_0(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 1 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_lane_0_15(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 0 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vrepb %v5, %v25, 15 +; vgbm %v7, 32768 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_0(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vrepb %v5, %v25, 0 +; vgbm %v7, 1 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_i8x16_lane_15_15(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = extractlane.i8x16 v1, 15 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vgbm %v5, 32768 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_i8x16_mem_0(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 0 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = load.i8 little v1 + v3 = insertlane.i8x16 v0, v2, 15 + return v3 +} + +; block0: +; vleb %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_0(f64x2, f64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 0 + return v2 +} + +; block0: +; vpdi %v24, %v24, %v0, 0 +; br %r14 + +function %insertlane_f64x2_1(f64x2, f64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64): + v2 = insertlane.f64x2 v0, v1, 1 + return v2 +} + +; block0: +; vpdi %v24, %v0, %v24, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_0(f64x2, f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 1 +; br %r14 + +function %insertlane_f64x2_lane_0_1(f64x2, f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 0 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 5 +; br %r14 + +function %insertlane_f64x2_lane_1_0(f64x2, f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vpdi %v24, %v24, %v25, 0 +; br %r14 + +function 
%insertlane_f64x2_lane_1_1(f64x2, f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: f64x2): + v2 = extractlane.f64x2 v1, 1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vpdi %v24, %v25, %v24, 1 +; br %r14 + +function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; vleg %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f64x2_mem_little_0(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 0 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = load.f64 little v1 + v3 = insertlane.f64x2 v0, v2, 1 + return v3 +} + +; block0: +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 0 +; br %r14 + +function %insertlane_f32x4_0(f32x4, f32) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 0 + return v2 +} + +; block0: +; vrepf %v5, %v0, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_3(f32x4, f32) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32): + v2 = insertlane.f32x4 v0, v1, 3 + return v2 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v0, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_0(f32x4, f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vgbm %v5, 15 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_lane_0_3(f32x4, f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 0 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_0(f32x4, f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 +; br %r14 + +function %insertlane_f32x4_lane_3_3(f32x4, f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: f32x4): + v2 = extractlane.f32x4 v1, 3 + v3 = insertlane.f32x4 v0, v2, 3 + return v3 +} + +; block0: +; vgbm %v5, 61440 +; vsel %v24, %v25, %v24, %v5 +; br %r14 + +function %insertlane_f32x4_mem_0(f32x4, i64) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = load.f32 v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; vlef %v24, 0(%r2), 0 +; br %r14 + +function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = load.f32 little v1 + v3 = insertlane.f32x4 v0, v2, 0 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 wasmtime_system_v { +block0(v0: i32x4, v1: 
i64): + v2 = load.i32 little v1 + v3 = insertlane.i32x4 v0, v2, 3 + return v3 +} + +; block0: +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 0 +; br %r14 + +function %extractlane_i64x2_0(i64x2) -> i64 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 1 +; br %r14 + +function %extractlane_i64x2_1(i64x2) -> i64 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + return v1 +} + +; block0: +; vlgvg %r2, %v24, 0 +; br %r14 + +function %extractlane_i64x2_mem_0(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_i64x2_mem_1(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i64x2_mem_little_0(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i64x2_mem_little_1(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + v2 = extractlane.i64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_0(i32x4) -> i32 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 3 +; br %r14 + +function %extractlane_i32x4_3(i32x4) -> i32 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + return v1 +} + +; block0: +; vlgvf %r2, %v24, 0 +; br %r14 + +function %extractlane_i32x4_mem_0(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_i32x4_mem_3(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i32x4_mem_little_0(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i32x4_mem_little_3(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + v2 = extractlane.i32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_0(i16x8) -> i16 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 7 +; br %r14 + +function %extractlane_i16x8_7(i16x8) -> i16 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + return v1 +} + +; block0: +; vlgvh %r2, %v24, 0 +; br %r14 + +function %extractlane_i16x8_mem_0(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 7 +; br %r14 + +function %extractlane_i16x8_mem_7(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store v2, v1 + return +} + +; block0: +; vsteh %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i16x8_mem_little_0(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 0 + store little v2, v1 
+ return +} + +; block0: +; vlgvh %r3, %v24, 7 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i16x8_mem_little_7(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + v2 = extractlane.i16x8 v0, 7 + store little v2, v1 + return +} + +; block0: +; vlgvh %r3, %v24, 0 +; strvh %r3, 0(%r2) +; br %r14 + +function %extractlane_i8x16_0(i8x16) -> i8 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 15 +; br %r14 + +function %extractlane_i8x16_15(i8x16) -> i8 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + return v1 +} + +; block0: +; vlgvb %r2, %v24, 0 +; br %r14 + +function %extractlane_i8x16_mem_0(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_15(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_i8x16_mem_little_0(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 0 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 15 +; br %r14 + +function %extractlane_i8x16_mem_little_15(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + v2 = extractlane.i8x16 v0, 15 + store little v2, v1 + return +} + +; block0: +; vsteb %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_0(f64x2) -> f64 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + return v1 +} + +; block0: +; vrepg %v0, %v24, 1 +; br %r14 + +function %extractlane_f64x2_1(f64x2) -> f64 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + return v1 +} + +; block0: +; vrepg %v0, %v24, 0 +; br %r14 + +function %extractlane_f64x2_mem_0(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 1 +; br %r14 + +function %extractlane_f64x2_mem_1(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store v2, v1 + return +} + +; block0: +; vsteg %v24, 0(%r2), 0 +; br %r14 + +function %extractlane_f64x2_mem_little_0(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f64x2_mem_little_1(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + v2 = extractlane.f64x2 v0, 1 + store little v2, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 0 +; strvg %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_0(f32x4) -> f32 wasmtime_system_v { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + return v1 +} + +; block0: +; vrepf %v0, %v24, 3 +; br %r14 + +function %extractlane_f32x4_3(f32x4) -> f32 wasmtime_system_v { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + return v1 +} + +; block0: +; vrepf %v0, %v24, 0 +; br %r14 + +function %extractlane_f32x4_mem_0(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 3 +; br %r14 + +function %extractlane_f32x4_mem_3(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store v2, v1 + return +} + +; block0: +; vstef %v24, 0(%r2), 0 +; br %r14 + +function 
%extractlane_f32x4_mem_little_0(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 0 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 3 +; strv %r3, 0(%r2) +; br %r14 + +function %extractlane_f32x4_mem_little_3(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + v2 = extractlane.f32x4 v0, 3 + store little v2, v1 + return +} + +; block0: +; vlgvf %r3, %v24, 0 +; strv %r3, 0(%r2) +; br %r14 + +function %splat_i64x2(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; ldgr %f3, %r2 +; vrepg %v24, %v3, 0 +; br %r14 + +function %splat_i64x2_imm() -> i64x2 wasmtime_system_v { +block0: + v0 = iconst.i64 123 + v1 = splat.i64x2 v0 + return v1 +} + +; block0: +; vrepig %v24, 123 +; br %r14 + +function %splat_i64x2_lane_0(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_i64x2_lane_1(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_i64x2_mem(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_i64x2_mem_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = splat.i64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_i32x4(i32) -> i32x4 wasmtime_system_v { +block0(v0: i32): + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vlvgf %v3, %r2, 0 +; vrepf %v24, %v3, 0 +; br %r14 + +function %splat_i32x4_imm() -> i32x4 wasmtime_system_v { +block0: + v0 = iconst.i32 123 + v1 = splat.i32x4 v0 + return v1 +} + +; block0: +; vrepif %v24, 123 +; br %r14 + +function %splat_i32x4_lane_0(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_i32x4_mem(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_i32x4_mem_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + +function %splat_i16x8(i16) -> i16x8 wasmtime_system_v { +block0(v0: i16): + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vlvgh %v3, %r2, 0 +; vreph %v24, %v3, 0 +; br %r14 + +function %splat_i16x8_imm() -> i16x8 wasmtime_system_v { +block0: + v0 = iconst.i16 123 + v1 = splat.i16x8 v0 + return v1 +} + +; block0: +; vrepih %v24, 123 +; br %r14 + +function %splat_i16x8_lane_0(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, %v24, 7 +; br %r14 + +function %splat_i16x8_lane_7(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vreph %v24, 
%v24, 0 +; br %r14 + +function %splat_i16x8_mem(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; vlreph %v24, 0(%r2) +; br %r14 + +function %splat_i16x8_mem_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = splat.i16x8 v1 + return v2 +} + +; block0: +; lrvh %r5, 0(%r2) +; vlvgh %v5, %r5, 0 +; vreph %v24, %v5, 0 +; br %r14 + +function %splat_i8x16(i8) -> i8x16 wasmtime_system_v { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vlvgb %v3, %r2, 0 +; vrepb %v24, %v3, 0 +; br %r14 + +function %splat_i8x16_imm() -> i8x16 wasmtime_system_v { +block0: + v0 = iconst.i8 123 + v1 = splat.i8x16 v0 + return v1 +} + +; block0: +; vrepib %v24, 123 +; br %r14 + +function %splat_i8x16_lane_0(i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 15 +; br %r14 + +function %splat_i8x16_lane_15(i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v24, %v24, 0 +; br %r14 + +function %splat_i8x16_mem(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_i8x16_mem_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = splat.i8x16 v1 + return v2 +} + +; block0: +; vlrepb %v24, 0(%r2) +; br %r14 + +function %splat_f64x2(f64) -> f64x2 wasmtime_system_v { +block0(v0: f64): + v1 = splat.f64x2 v0 + return v1 +} + +; block0: +; vrepg %v24, %v0, 0 +; br %r14 + +function %splat_f64x2_lane_0(f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 1 +; br %r14 + +function %splat_f64x2_lane_1(f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vrepg %v24, %v24, 0 +; br %r14 + +function %splat_f64x2_mem(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; vlrepg %v24, 0(%r2) +; br %r14 + +function %splat_f64x2_mem_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = splat.f64x2 v1 + return v2 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vrepg %v24, %v5, 0 +; br %r14 + +function %splat_f32x4(f32) -> f32x4 wasmtime_system_v { +block0(v0: f32): + v1 = splat.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v24, %v0, 0 +; br %r14 + +function %splat_f32x4_lane_0(f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 3 +; br %r14 + +function %splat_i32x4_lane_3(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = splat.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v24, %v24, 0 +; br %r14 + +function %splat_f32x4_mem(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; vlrepf %v24, 0(%r2) +; br %r14 + +function %splat_f32x4_mem_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = splat.f32x4 v1 + return v2 +} + +; block0: +; lrv %r5, 0(%r2) +; vlvgf %v5, %r5, 0 +; vrepf %v24, %v5, 0 +; br %r14 + 
+function %scalar_to_vector_i64x2(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgg %v24, %r2, 1 +; br %r14 + +function %scalar_to_vector_i64x2_imm() -> i64x2 wasmtime_system_v { +block0: + v0 = iconst.i64 123 + v1 = scalar_to_vector.i64x2 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleig %v24, 123, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_0(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_i64x2_lane_1(i64x2) -> i64x2 wasmtime_system_v { +block0(v0: i64x2): + v1 = extractlane.i64x2 v0, 1 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_i64x2_mem(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64 little v0 + v2 = scalar_to_vector.i64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_i32x4(i32) -> i32x4 wasmtime_system_v { +block0(v0: i32): + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgf %v24, %r2, 3 +; br %r14 + +function %scalar_to_vector_i32x4_imm() -> i32x4 wasmtime_system_v { +block0: + v0 = iconst.i32 123 + v1 = scalar_to_vector.i32x4 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleif %v24, 123, 3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_0(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v3, 15 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i32x4_lane_3(i32x4) -> i32x4 wasmtime_system_v { +block0(v0: i32x4): + v1 = extractlane.i32x4 v0, 3 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i32x4_mem(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32 little v0 + v2 = scalar_to_vector.i32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + +function %scalar_to_vector_i16x8(i16) -> i16x8 wasmtime_system_v { +block0(v0: i16): + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgh %v24, %r2, 7 +; br %r14 + +function %scalar_to_vector_i16x8_imm() -> i16x8 wasmtime_system_v { +block0: + v0 = iconst.i16 123 + v1 = scalar_to_vector.i16x8 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleih %v24, 123, 7 +; br %r14 + +function %scalar_to_vector_i16x8_lane_0(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v3, 3 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i16x8_lane_7(i16x8) -> i16x8 wasmtime_system_v { +block0(v0: i16x8): + v1 = extractlane.i16x8 v0, 7 + v2 = 
scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vreph %v3, %v24, 0 +; vgbm %v5, 3 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i16x8_mem(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleh %v24, 0(%r2), 7 +; br %r14 + +function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16 little v0 + v2 = scalar_to_vector.i16x8 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvh %r3, 0(%r2) +; vlvgh %v24, %r3, 7 +; br %r14 + +function %scalar_to_vector_i8x16(i8) -> i8x16 wasmtime_system_v { +block0(v0: i8): + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vlvgb %v24, %r2, 15 +; br %r14 + +function %scalar_to_vector_i8x16_imm() -> i8x16 wasmtime_system_v { +block0: + v0 = iconst.i8 123 + v1 = scalar_to_vector.i8x16 v0 + return v1 +} + +; block0: +; vgbm %v24, 0 +; vleib %v24, 123, 15 +; br %r14 + +function %scalar_to_vector_i8x16_lane_0(i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v3, 1 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_i8x16_lane_15(i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16): + v1 = extractlane.i8x16 v0, 15 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vrepb %v3, %v24, 0 +; vgbm %v5, 1 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_i8x16_mem(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8 little v0 + v2 = scalar_to_vector.i8x16 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleb %v24, 0(%r2), 15 +; br %r14 + +function %scalar_to_vector_f64x2(f64) -> f64x2 wasmtime_system_v { +block0(v0: f64): + v1 = scalar_to_vector.f64x2 v0 + return v1 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v0, 0 +; br %r14 + +function %scalar_to_vector_f64x2_lane_0(f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 1 +; br %r14 + +function %scalar_to_vector_f64x2_lane_1(f64x2) -> f64x2 wasmtime_system_v { +block0(v0: f64x2): + v1 = extractlane.f64x2 v0, 1 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v3, 0 +; vpdi %v24, %v3, %v24, 0 +; br %r14 + +function %scalar_to_vector_f64x2_mem(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vleg %v24, 0(%r2), 1 +; br %r14 + +function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64 little v0 + v2 = scalar_to_vector.f64x2 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrvg %r3, 0(%r2) +; vlvgg %v24, %r3, 1 +; br %r14 + +function %scalar_to_vector_f32x4(f32) -> f32x4 wasmtime_system_v { +block0(v0: f32): + v1 = scalar_to_vector.f32x4 v0 + return v1 +} + +; block0: +; vrepf %v3, %v0, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_lane_0(f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm 
%v3, 15 +; vn %v24, %v24, %v3 +; br %r14 + +function %scalar_to_vector_f32x4_lane_3(f32x4) -> f32x4 wasmtime_system_v { +block0(v0: f32x4): + v1 = extractlane.f32x4 v0, 3 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vrepf %v3, %v24, 0 +; vgbm %v5, 15 +; vn %v24, %v3, %v5 +; br %r14 + +function %scalar_to_vector_f32x4_mem(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; vlef %v24, 0(%r2), 3 +; br %r14 + +function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32 little v0 + v2 = scalar_to_vector.f32x4 v1 + return v2 +} + +; block0: +; vgbm %v24, 0 +; lrv %r3, 0(%r2) +; vlvgf %v24, %r3, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif index 7efa4e3b719a..fc64900f2980 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-lane.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-lane.clif @@ -8,7 +8,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlvgg %v24, %r2, 1 +; vlvgg %v24, %r2, 0 ; br %r14 function %insertlane_i64x2_1(i64x2, i64) -> i64x2 { @@ -18,7 +18,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlvgg %v24, %r2, 0 +; vlvgg %v24, %r2, 1 ; br %r14 function %insertlane_i64x2_imm_0(i64x2) -> i64x2 { @@ -29,7 +29,7 @@ block0(v0: i64x2): } ; block0: -; vleig %v24, 123, 1 +; vleig %v24, 123, 0 ; br %r14 function %insertlane_i64x2_imm_1(i64x2) -> i64x2 { @@ -40,7 +40,7 @@ block0(v0: i64x2): } ; block0: -; vleig %v24, 123, 0 +; vleig %v24, 123, 1 ; br %r14 function %insertlane_i64x2_lane_0_0(i64x2, i64x2) -> i64x2 { @@ -51,7 +51,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpdi %v24, %v24, %v25, 1 +; vpdi %v24, %v25, %v24, 1 ; br %r14 function %insertlane_i64x2_lane_0_1(i64x2, i64x2) -> i64x2 { @@ -62,7 +62,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpdi %v24, %v25, %v24, 5 +; vpdi %v24, %v24, %v25, 0 ; br %r14 function %insertlane_i64x2_lane_1_0(i64x2, i64x2) -> i64x2 { @@ -73,7 +73,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpdi %v24, %v24, %v25, 0 +; vpdi %v24, %v25, %v24, 5 ; br %r14 function %insertlane_i64x2_lane_1_1(i64x2, i64x2) -> i64x2 { @@ -84,7 +84,7 @@ block0(v0: i64x2, v1: i64x2): } ; block0: -; vpdi %v24, %v25, %v24, 1 +; vpdi %v24, %v24, %v25, 1 ; br %r14 function %insertlane_i64x2_mem_0(i64x2, i64) -> i64x2 { @@ -95,7 +95,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %insertlane_i64x2_mem_1(i64x2, i64) -> i64x2 { @@ -106,7 +106,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 0 +; vleg %v24, 0(%r2), 1 ; br %r14 function %insertlane_i64x2_mem_little_0(i64x2, i64) -> i64x2 { @@ -118,7 +118,7 @@ block0(v0: i64x2, v1: i64): ; block0: ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 1 +; vlvgg %v24, %r3, 0 ; br %r14 function %insertlane_i64x2_mem_little_1(i64x2, i64) -> i64x2 { @@ -130,7 +130,7 @@ block0(v0: i64x2, v1: i64): ; block0: ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 0 +; vlvgg %v24, %r3, 1 ; br %r14 function %insertlane_i32x4_0(i32x4, i32) -> i32x4 { @@ -140,7 +140,7 @@ block0(v0: i32x4, v1: i32): } ; block0: -; vlvgf %v24, %r2, 3 +; vlvgf %v24, %r2, 0 ; br %r14 function %insertlane_i32x4_3(i32x4, i32) -> i32x4 { @@ -150,7 +150,7 @@ block0(v0: i32x4, v1: i32): } ; block0: -; vlvgf %v24, %r2, 0 +; vlvgf %v24, %r2, 3 ; br %r14 function %insertlane_i32x4_imm_0(i32x4) -> i32x4 { @@ -161,7 +161,7 @@ block0(v0: i32x4): } ; 
block0: -; vleif %v24, 123, 3 +; vleif %v24, 123, 0 ; br %r14 function %insertlane_i32x4_imm_3(i32x4) -> i32x4 { @@ -172,7 +172,7 @@ block0(v0: i32x4): } ; block0: -; vleif %v24, 123, 0 +; vleif %v24, 123, 3 ; br %r14 function %insertlane_i32x4_lane_0_0(i32x4, i32x4) -> i32x4 { @@ -183,7 +183,7 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vgbm %v5, 15 +; vgbm %v5, 61440 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -195,8 +195,8 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vrepf %v5, %v25, 3 -; vgbm %v7, 61440 +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -208,8 +208,8 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vrepf %v5, %v25, 0 -; vgbm %v7, 15 +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -221,7 +221,7 @@ block0(v0: i32x4, v1: i32x4): } ; block0: -; vgbm %v5, 61440 +; vgbm %v5, 15 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -233,7 +233,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { @@ -244,7 +244,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 0 +; vlef %v24, 0(%r2), 3 ; br %r14 function %insertlane_i32x4_mem_little_0(i32x4, i64) -> i32x4 { @@ -256,7 +256,7 @@ block0(v0: i32x4, v1: i64): ; block0: ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 3 +; vlvgf %v24, %r3, 0 ; br %r14 function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { @@ -268,7 +268,7 @@ block0(v0: i32x4, v1: i64): ; block0: ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 0 +; vlvgf %v24, %r3, 3 ; br %r14 function %insertlane_i16x8_0(i16x8, i16) -> i16x8 { @@ -278,7 +278,7 @@ block0(v0: i16x8, v1: i16): } ; block0: -; vlvgh %v24, %r2, 7 +; vlvgh %v24, %r2, 0 ; br %r14 function %insertlane_i16x8_7(i16x8, i16) -> i16x8 { @@ -288,7 +288,7 @@ block0(v0: i16x8, v1: i16): } ; block0: -; vlvgh %v24, %r2, 0 +; vlvgh %v24, %r2, 7 ; br %r14 function %insertlane_i16x8_imm_0(i16x8) -> i16x8 { @@ -299,7 +299,7 @@ block0(v0: i16x8): } ; block0: -; vleih %v24, 123, 7 +; vleih %v24, 123, 0 ; br %r14 function %insertlane_i16x8_imm_7(i16x8) -> i16x8 { @@ -310,7 +310,7 @@ block0(v0: i16x8): } ; block0: -; vleih %v24, 123, 0 +; vleih %v24, 123, 7 ; br %r14 function %insertlane_i16x8_lane_0_0(i16x8, i16x8) -> i16x8 { @@ -321,7 +321,7 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vgbm %v5, 3 +; vgbm %v5, 49152 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -333,8 +333,8 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vreph %v5, %v25, 7 -; vgbm %v7, 49152 +; vreph %v5, %v25, 0 +; vgbm %v7, 3 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -346,8 +346,8 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vreph %v5, %v25, 0 -; vgbm %v7, 3 +; vreph %v5, %v25, 7 +; vgbm %v7, 49152 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -359,7 +359,7 @@ block0(v0: i16x8, v1: i16x8): } ; block0: -; vgbm %v5, 49152 +; vgbm %v5, 3 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -371,7 +371,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vleh %v24, 0(%r2), 7 +; vleh %v24, 0(%r2), 0 ; br %r14 function %insertlane_i16x8_mem_7(i16x8, i64) -> i16x8 { @@ -382,7 +382,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vleh %v24, 0(%r2), 0 +; vleh %v24, 0(%r2), 7 ; br %r14 function %insertlane_i16x8_mem_little_0(i16x8, i64) -> i16x8 { @@ -394,7 +394,7 @@ block0(v0: i16x8, v1: i64): ; block0: ; lrvh %r3, 0(%r2) -; vlvgh %v24, %r3, 7 +; vlvgh %v24, %r3, 0 ; br %r14 function %insertlane_i16x8_mem_little_7(i16x8, i64) -> i16x8 { @@ -406,7 +406,7 @@ block0(v0: i16x8, v1: i64): ; block0: ; lrvh %r3, 0(%r2) -; vlvgh 
%v24, %r3, 0 +; vlvgh %v24, %r3, 7 ; br %r14 function %insertlane_i8x16_0(i8x16, i8) -> i8x16 { @@ -416,7 +416,7 @@ block0(v0: i8x16, v1: i8): } ; block0: -; vlvgb %v24, %r2, 15 +; vlvgb %v24, %r2, 0 ; br %r14 function %insertlane_i8x16_15(i8x16, i8) -> i8x16 { @@ -426,7 +426,7 @@ block0(v0: i8x16, v1: i8): } ; block0: -; vlvgb %v24, %r2, 0 +; vlvgb %v24, %r2, 15 ; br %r14 function %insertlane_i8x16_imm_0(i8x16) -> i8x16 { @@ -437,7 +437,7 @@ block0(v0: i8x16): } ; block0: -; vleib %v24, 123, 15 +; vleib %v24, 123, 0 ; br %r14 function %insertlane_i8x16_imm_15(i8x16) -> i8x16 { @@ -448,7 +448,7 @@ block0(v0: i8x16): } ; block0: -; vleib %v24, 123, 0 +; vleib %v24, 123, 15 ; br %r14 function %insertlane_i8x16_lane_0_0(i8x16, i8x16) -> i8x16 { @@ -459,7 +459,7 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vgbm %v5, 1 +; vgbm %v5, 32768 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -471,8 +471,8 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vrepb %v5, %v25, 15 -; vgbm %v7, 32768 +; vrepb %v5, %v25, 0 +; vgbm %v7, 1 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -484,8 +484,8 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vrepb %v5, %v25, 0 -; vgbm %v7, 1 +; vrepb %v5, %v25, 15 +; vgbm %v7, 32768 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -497,7 +497,7 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vgbm %v5, 32768 +; vgbm %v5, 1 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -509,7 +509,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %insertlane_i8x16_mem_15(i8x16, i64) -> i8x16 { @@ -520,7 +520,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 0 +; vleb %v24, 0(%r2), 15 ; br %r14 function %insertlane_i8x16_mem_little_0(i8x16, i64) -> i8x16 { @@ -531,7 +531,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %insertlane_i8x16_mem_little_15(i8x16, i64) -> i8x16 { @@ -542,7 +542,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vleb %v24, 0(%r2), 0 +; vleb %v24, 0(%r2), 15 ; br %r14 function %insertlane_f64x2_0(f64x2, f64) -> f64x2 { @@ -552,7 +552,7 @@ block0(v0: f64x2, v1: f64): } ; block0: -; vpdi %v24, %v24, %v0, 0 +; vpdi %v24, %v0, %v24, 1 ; br %r14 function %insertlane_f64x2_1(f64x2, f64) -> f64x2 { @@ -562,7 +562,7 @@ block0(v0: f64x2, v1: f64): } ; block0: -; vpdi %v24, %v0, %v24, 1 +; vpdi %v24, %v24, %v0, 0 ; br %r14 function %insertlane_f64x2_lane_0_0(f64x2, f64x2) -> f64x2 { @@ -573,7 +573,7 @@ block0(v0: f64x2, v1: f64x2): } ; block0: -; vpdi %v24, %v24, %v25, 1 +; vpdi %v24, %v25, %v24, 1 ; br %r14 function %insertlane_f64x2_lane_0_1(f64x2, f64x2) -> f64x2 { @@ -584,7 +584,7 @@ block0(v0: f64x2, v1: f64x2): } ; block0: -; vpdi %v24, %v25, %v24, 5 +; vpdi %v24, %v24, %v25, 0 ; br %r14 function %insertlane_f64x2_lane_1_0(f64x2, f64x2) -> f64x2 { @@ -595,7 +595,7 @@ block0(v0: f64x2, v1: f64x2): } ; block0: -; vpdi %v24, %v24, %v25, 0 +; vpdi %v24, %v25, %v24, 5 ; br %r14 function %insertlane_f64x2_lane_1_1(f64x2, f64x2) -> f64x2 { @@ -606,7 +606,7 @@ block0(v0: f64x2, v1: f64x2): } ; block0: -; vpdi %v24, %v25, %v24, 1 +; vpdi %v24, %v24, %v25, 1 ; br %r14 function %insertlane_f64x2_mem_0(f64x2, i64) -> f64x2 { @@ -617,7 +617,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %insertlane_f64x2_mem_1(f64x2, i64) -> f64x2 { @@ -628,7 +628,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vleg %v24, 0(%r2), 0 +; vleg %v24, 0(%r2), 1 ; br %r14 function %insertlane_f64x2_mem_little_0(f64x2, i64) -> 
f64x2 { @@ -640,7 +640,7 @@ block0(v0: f64x2, v1: i64): ; block0: ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 1 +; vlvgg %v24, %r3, 0 ; br %r14 function %insertlane_f64x2_mem_little_1(f64x2, i64) -> f64x2 { @@ -652,7 +652,7 @@ block0(v0: f64x2, v1: i64): ; block0: ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 0 +; vlvgg %v24, %r3, 1 ; br %r14 function %insertlane_f32x4_0(f32x4, f32) -> f32x4 { @@ -662,9 +662,8 @@ block0(v0: f32x4, v1: f32): } ; block0: -; vrepf %v5, %v0, 0 -; vgbm %v7, 15 -; vsel %v24, %v5, %v24, %v7 +; vgbm %v5, 61440 +; vsel %v24, %v0, %v24, %v5 ; br %r14 function %insertlane_f32x4_3(f32x4, f32) -> f32x4 { @@ -674,8 +673,9 @@ block0(v0: f32x4, v1: f32): } ; block0: -; vgbm %v5, 61440 -; vsel %v24, %v0, %v24, %v5 +; vrepf %v5, %v0, 0 +; vgbm %v7, 15 +; vsel %v24, %v5, %v24, %v7 ; br %r14 function %insertlane_f32x4_lane_0_0(f32x4, f32x4) -> f32x4 { @@ -686,7 +686,7 @@ block0(v0: f32x4, v1: f32x4): } ; block0: -; vgbm %v5, 15 +; vgbm %v5, 61440 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -698,8 +698,8 @@ block0(v0: f32x4, v1: f32x4): } ; block0: -; vrepf %v5, %v25, 3 -; vgbm %v7, 61440 +; vrepf %v5, %v25, 0 +; vgbm %v7, 15 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -711,8 +711,8 @@ block0(v0: f32x4, v1: f32x4): } ; block0: -; vrepf %v5, %v25, 0 -; vgbm %v7, 15 +; vrepf %v5, %v25, 3 +; vgbm %v7, 61440 ; vsel %v24, %v5, %v24, %v7 ; br %r14 @@ -724,7 +724,7 @@ block0(v0: f32x4, v1: f32x4): } ; block0: -; vgbm %v5, 61440 +; vgbm %v5, 15 ; vsel %v24, %v25, %v24, %v5 ; br %r14 @@ -736,7 +736,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %insertlane_i32x4_mem_3(i32x4, i64) -> i32x4 { @@ -747,7 +747,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlef %v24, 0(%r2), 0 +; vlef %v24, 0(%r2), 3 ; br %r14 function %insertlane_f32x4_mem_little_0(f32x4, i64) -> f32x4 { @@ -759,7 +759,7 @@ block0(v0: f32x4, v1: i64): ; block0: ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 3 +; vlvgf %v24, %r3, 0 ; br %r14 function %insertlane_i32x4_mem_little_3(i32x4, i64) -> i32x4 { @@ -771,7 +771,7 @@ block0(v0: i32x4, v1: i64): ; block0: ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 0 +; vlvgf %v24, %r3, 3 ; br %r14 function %extractlane_i64x2_0(i64x2) -> i64 { @@ -781,7 +781,7 @@ block0(v0: i64x2): } ; block0: -; vlgvg %r2, %v24, 1 +; vlgvg %r2, %v24, 0 ; br %r14 function %extractlane_i64x2_1(i64x2) -> i64 { @@ -791,7 +791,7 @@ block0(v0: i64x2): } ; block0: -; vlgvg %r2, %v24, 0 +; vlgvg %r2, %v24, 1 ; br %r14 function %extractlane_i64x2_mem_0(i64x2, i64) { @@ -802,7 +802,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 1 +; vsteg %v24, 0(%r2), 0 ; br %r14 function %extractlane_i64x2_mem_1(i64x2, i64) { @@ -813,7 +813,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 0 +; vsteg %v24, 0(%r2), 1 ; br %r14 function %extractlane_i64x2_mem_little_0(i64x2, i64) { @@ -824,7 +824,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 +; vlgvg %r3, %v24, 0 ; strvg %r3, 0(%r2) ; br %r14 @@ -836,7 +836,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 0 +; vlgvg %r3, %v24, 1 ; strvg %r3, 0(%r2) ; br %r14 @@ -847,7 +847,7 @@ block0(v0: i32x4): } ; block0: -; vlgvf %r2, %v24, 3 +; vlgvf %r2, %v24, 0 ; br %r14 function %extractlane_i32x4_3(i32x4) -> i32 { @@ -857,7 +857,7 @@ block0(v0: i32x4): } ; block0: -; vlgvf %r2, %v24, 0 +; vlgvf %r2, %v24, 3 ; br %r14 function %extractlane_i32x4_mem_0(i32x4, i64) { @@ -868,7 +868,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 3 +; vstef %v24, 0(%r2), 0 ; br 
%r14 function %extractlane_i32x4_mem_3(i32x4, i64) { @@ -879,7 +879,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 0 +; vstef %v24, 0(%r2), 3 ; br %r14 function %extractlane_i32x4_mem_little_0(i32x4, i64) { @@ -890,7 +890,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlgvf %r3, %v24, 3 +; vlgvf %r3, %v24, 0 ; strv %r3, 0(%r2) ; br %r14 @@ -902,7 +902,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlgvf %r3, %v24, 0 +; vlgvf %r3, %v24, 3 ; strv %r3, 0(%r2) ; br %r14 @@ -913,7 +913,7 @@ block0(v0: i16x8): } ; block0: -; vlgvh %r2, %v24, 7 +; vlgvh %r2, %v24, 0 ; br %r14 function %extractlane_i16x8_7(i16x8) -> i16 { @@ -923,7 +923,7 @@ block0(v0: i16x8): } ; block0: -; vlgvh %r2, %v24, 0 +; vlgvh %r2, %v24, 7 ; br %r14 function %extractlane_i16x8_mem_0(i16x8, i64) { @@ -934,7 +934,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vsteh %v24, 0(%r2), 7 +; vsteh %v24, 0(%r2), 0 ; br %r14 function %extractlane_i16x8_mem_7(i16x8, i64) { @@ -945,7 +945,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vsteh %v24, 0(%r2), 0 +; vsteh %v24, 0(%r2), 7 ; br %r14 function %extractlane_i16x8_mem_little_0(i16x8, i64) { @@ -956,7 +956,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vlgvh %r3, %v24, 7 +; vlgvh %r3, %v24, 0 ; strvh %r3, 0(%r2) ; br %r14 @@ -968,7 +968,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vlgvh %r3, %v24, 0 +; vlgvh %r3, %v24, 7 ; strvh %r3, 0(%r2) ; br %r14 @@ -979,7 +979,7 @@ block0(v0: i8x16): } ; block0: -; vlgvb %r2, %v24, 15 +; vlgvb %r2, %v24, 0 ; br %r14 function %extractlane_i8x16_15(i8x16) -> i8 { @@ -989,7 +989,7 @@ block0(v0: i8x16): } ; block0: -; vlgvb %r2, %v24, 0 +; vlgvb %r2, %v24, 15 ; br %r14 function %extractlane_i8x16_mem_0(i8x16, i64) { @@ -1000,7 +1000,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 15 +; vsteb %v24, 0(%r2), 0 ; br %r14 function %extractlane_i8x16_mem_15(i8x16, i64) { @@ -1011,7 +1011,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 0 +; vsteb %v24, 0(%r2), 15 ; br %r14 function %extractlane_i8x16_mem_little_0(i8x16, i64) { @@ -1022,7 +1022,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 15 +; vsteb %v24, 0(%r2), 0 ; br %r14 function %extractlane_i8x16_mem_little_15(i8x16, i64) { @@ -1033,7 +1033,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vsteb %v24, 0(%r2), 0 +; vsteb %v24, 0(%r2), 15 ; br %r14 function %extractlane_f64x2_0(f64x2) -> f64 { @@ -1043,7 +1043,7 @@ block0(v0: f64x2): } ; block0: -; vrepg %v0, %v24, 1 +; vrepg %v0, %v24, 0 ; br %r14 function %extractlane_f64x2_1(f64x2) -> f64 { @@ -1053,7 +1053,7 @@ block0(v0: f64x2): } ; block0: -; vrepg %v0, %v24, 0 +; vrepg %v0, %v24, 1 ; br %r14 function %extractlane_f64x2_mem_0(f64x2, i64) { @@ -1064,7 +1064,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 1 +; vsteg %v24, 0(%r2), 0 ; br %r14 function %extractlane_f64x2_mem_1(f64x2, i64) { @@ -1075,7 +1075,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vsteg %v24, 0(%r2), 0 +; vsteg %v24, 0(%r2), 1 ; br %r14 function %extractlane_f64x2_mem_little_0(f64x2, i64) { @@ -1086,7 +1086,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 +; vlgvg %r3, %v24, 0 ; strvg %r3, 0(%r2) ; br %r14 @@ -1098,7 +1098,7 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 0 +; vlgvg %r3, %v24, 1 ; strvg %r3, 0(%r2) ; br %r14 @@ -1109,7 +1109,7 @@ block0(v0: f32x4): } ; block0: -; vrepf %v0, %v24, 3 +; vrepf %v0, %v24, 0 ; br %r14 function %extractlane_f32x4_3(f32x4) -> f32 { @@ -1119,7 +1119,7 @@ block0(v0: f32x4): } ; block0: -; vrepf 
%v0, %v24, 0 +; vrepf %v0, %v24, 3 ; br %r14 function %extractlane_f32x4_mem_0(f32x4, i64) { @@ -1130,7 +1130,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 3 +; vstef %v24, 0(%r2), 0 ; br %r14 function %extractlane_f32x4_mem_3(f32x4, i64) { @@ -1141,7 +1141,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstef %v24, 0(%r2), 0 +; vstef %v24, 0(%r2), 3 ; br %r14 function %extractlane_f32x4_mem_little_0(f32x4, i64) { @@ -1152,7 +1152,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlgvf %r3, %v24, 3 +; vlgvf %r3, %v24, 0 ; strv %r3, 0(%r2) ; br %r14 @@ -1164,7 +1164,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlgvf %r3, %v24, 0 +; vlgvf %r3, %v24, 3 ; strv %r3, 0(%r2) ; br %r14 @@ -1198,7 +1198,7 @@ block0(v0: i64x2): } ; block0: -; vrepg %v24, %v24, 1 +; vrepg %v24, %v24, 0 ; br %r14 function %splat_i64x2_lane_1(i64x2) -> i64x2 { @@ -1209,7 +1209,7 @@ block0(v0: i64x2): } ; block0: -; vrepg %v24, %v24, 0 +; vrepg %v24, %v24, 1 ; br %r14 function %splat_i64x2_mem(i64) -> i64x2 { @@ -1266,7 +1266,7 @@ block0(v0: i32x4): } ; block0: -; vrepf %v24, %v24, 3 +; vrepf %v24, %v24, 0 ; br %r14 function %splat_i32x4_lane_3(i32x4) -> i32x4 { @@ -1277,7 +1277,7 @@ block0(v0: i32x4): } ; block0: -; vrepf %v24, %v24, 0 +; vrepf %v24, %v24, 3 ; br %r14 function %splat_i32x4_mem(i64) -> i32x4 { @@ -1334,7 +1334,7 @@ block0(v0: i16x8): } ; block0: -; vreph %v24, %v24, 7 +; vreph %v24, %v24, 0 ; br %r14 function %splat_i16x8_lane_7(i16x8) -> i16x8 { @@ -1345,7 +1345,7 @@ block0(v0: i16x8): } ; block0: -; vreph %v24, %v24, 0 +; vreph %v24, %v24, 7 ; br %r14 function %splat_i16x8_mem(i64) -> i16x8 { @@ -1402,7 +1402,7 @@ block0(v0: i8x16): } ; block0: -; vrepb %v24, %v24, 15 +; vrepb %v24, %v24, 0 ; br %r14 function %splat_i8x16_lane_15(i8x16) -> i8x16 { @@ -1413,7 +1413,7 @@ block0(v0: i8x16): } ; block0: -; vrepb %v24, %v24, 0 +; vrepb %v24, %v24, 15 ; br %r14 function %splat_i8x16_mem(i64) -> i8x16 { @@ -1456,7 +1456,7 @@ block0(v0: f64x2): } ; block0: -; vrepg %v24, %v24, 1 +; vrepg %v24, %v24, 0 ; br %r14 function %splat_f64x2_lane_1(f64x2) -> f64x2 { @@ -1467,7 +1467,7 @@ block0(v0: f64x2): } ; block0: -; vrepg %v24, %v24, 0 +; vrepg %v24, %v24, 1 ; br %r14 function %splat_f64x2_mem(i64) -> f64x2 { @@ -1512,7 +1512,7 @@ block0(v0: f32x4): } ; block0: -; vrepf %v24, %v24, 3 +; vrepf %v24, %v24, 0 ; br %r14 function %splat_i32x4_lane_3(i32x4) -> i32x4 { @@ -1523,7 +1523,7 @@ block0(v0: i32x4): } ; block0: -; vrepf %v24, %v24, 0 +; vrepf %v24, %v24, 3 ; br %r14 function %splat_f32x4_mem(i64) -> f32x4 { @@ -1558,7 +1558,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlvgg %v24, %r2, 1 +; vlvgg %v24, %r2, 0 ; br %r14 function %scalar_to_vector_i64x2_imm() -> i64x2 { @@ -1570,7 +1570,7 @@ block0: ; block0: ; vgbm %v24, 0 -; vleig %v24, 123, 1 +; vleig %v24, 123, 0 ; br %r14 function %scalar_to_vector_i64x2_lane_0(i64x2) -> i64x2 { @@ -1582,7 +1582,7 @@ block0(v0: i64x2): ; block0: ; vgbm %v3, 0 -; vpdi %v24, %v3, %v24, 1 +; vpdi %v24, %v24, %v3, 0 ; br %r14 function %scalar_to_vector_i64x2_lane_1(i64x2) -> i64x2 { @@ -1594,7 +1594,7 @@ block0(v0: i64x2): ; block0: ; vgbm %v3, 0 -; vpdi %v24, %v3, %v24, 0 +; vpdi %v24, %v24, %v3, 4 ; br %r14 function %scalar_to_vector_i64x2_mem(i64) -> i64x2 { @@ -1606,7 +1606,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i64x2_mem_little(i64) -> i64x2 { @@ -1619,7 +1619,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 1 +; vlvgg 
%v24, %r3, 0 ; br %r14 function %scalar_to_vector_i32x4(i32) -> i32x4 { @@ -1630,7 +1630,7 @@ block0(v0: i32): ; block0: ; vgbm %v24, 0 -; vlvgf %v24, %r2, 3 +; vlvgf %v24, %r2, 0 ; br %r14 function %scalar_to_vector_i32x4_imm() -> i32x4 { @@ -1642,7 +1642,7 @@ block0: ; block0: ; vgbm %v24, 0 -; vleif %v24, 123, 3 +; vleif %v24, 123, 0 ; br %r14 function %scalar_to_vector_i32x4_lane_0(i32x4) -> i32x4 { @@ -1653,7 +1653,7 @@ block0(v0: i32x4): } ; block0: -; vgbm %v3, 15 +; vgbm %v3, 61440 ; vn %v24, %v24, %v3 ; br %r14 @@ -1665,8 +1665,8 @@ block0(v0: i32x4): } ; block0: -; vrepf %v3, %v24, 0 -; vgbm %v5, 15 +; vrepf %v3, %v24, 3 +; vgbm %v5, 61440 ; vn %v24, %v3, %v5 ; br %r14 @@ -1679,7 +1679,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i32x4_mem_little(i64) -> i32x4 { @@ -1692,7 +1692,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 3 +; vlvgf %v24, %r3, 0 ; br %r14 function %scalar_to_vector_i16x8(i16) -> i16x8 { @@ -1703,7 +1703,7 @@ block0(v0: i16): ; block0: ; vgbm %v24, 0 -; vlvgh %v24, %r2, 7 +; vlvgh %v24, %r2, 0 ; br %r14 function %scalar_to_vector_i16x8_imm() -> i16x8 { @@ -1715,7 +1715,7 @@ block0: ; block0: ; vgbm %v24, 0 -; vleih %v24, 123, 7 +; vleih %v24, 123, 0 ; br %r14 function %scalar_to_vector_i16x8_lane_0(i16x8) -> i16x8 { @@ -1726,7 +1726,7 @@ block0(v0: i16x8): } ; block0: -; vgbm %v3, 3 +; vgbm %v3, 49152 ; vn %v24, %v24, %v3 ; br %r14 @@ -1738,8 +1738,8 @@ block0(v0: i16x8): } ; block0: -; vreph %v3, %v24, 0 -; vgbm %v5, 3 +; vreph %v3, %v24, 7 +; vgbm %v5, 49152 ; vn %v24, %v3, %v5 ; br %r14 @@ -1752,7 +1752,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleh %v24, 0(%r2), 7 +; vleh %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i16x8_mem_little(i64) -> i16x8 { @@ -1765,7 +1765,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 ; lrvh %r3, 0(%r2) -; vlvgh %v24, %r3, 7 +; vlvgh %v24, %r3, 0 ; br %r14 function %scalar_to_vector_i8x16(i8) -> i8x16 { @@ -1776,7 +1776,7 @@ block0(v0: i8): ; block0: ; vgbm %v24, 0 -; vlvgb %v24, %r2, 15 +; vlvgb %v24, %r2, 0 ; br %r14 function %scalar_to_vector_i8x16_imm() -> i8x16 { @@ -1788,7 +1788,7 @@ block0: ; block0: ; vgbm %v24, 0 -; vleib %v24, 123, 15 +; vleib %v24, 123, 0 ; br %r14 function %scalar_to_vector_i8x16_lane_0(i8x16) -> i8x16 { @@ -1799,7 +1799,7 @@ block0(v0: i8x16): } ; block0: -; vgbm %v3, 1 +; vgbm %v3, 32768 ; vn %v24, %v24, %v3 ; br %r14 @@ -1811,8 +1811,8 @@ block0(v0: i8x16): } ; block0: -; vrepb %v3, %v24, 0 -; vgbm %v5, 1 +; vrepb %v3, %v24, 15 +; vgbm %v5, 32768 ; vn %v24, %v3, %v5 ; br %r14 @@ -1825,7 +1825,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_i8x16_mem_little(i64) -> i8x16 { @@ -1837,7 +1837,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleb %v24, 0(%r2), 15 +; vleb %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f64x2(f64) -> f64x2 { @@ -1848,7 +1848,7 @@ block0(v0: f64): ; block0: ; vgbm %v3, 0 -; vpdi %v24, %v3, %v0, 0 +; vpdi %v24, %v0, %v3, 0 ; br %r14 function %scalar_to_vector_f64x2_lane_0(f64x2) -> f64x2 { @@ -1860,7 +1860,7 @@ block0(v0: f64x2): ; block0: ; vgbm %v3, 0 -; vpdi %v24, %v3, %v24, 1 +; vpdi %v24, %v24, %v3, 0 ; br %r14 function %scalar_to_vector_f64x2_lane_1(f64x2) -> f64x2 { @@ -1872,7 +1872,7 @@ block0(v0: f64x2): ; block0: ; vgbm %v3, 0 -; vpdi %v24, %v3, %v24, 0 +; vpdi %v24, %v24, %v3, 4 ; br %r14 function %scalar_to_vector_f64x2_mem(i64) -> 
f64x2 { @@ -1884,7 +1884,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vleg %v24, 0(%r2), 1 +; vleg %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f64x2_mem_little(i64) -> f64x2 { @@ -1897,7 +1897,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 ; lrvg %r3, 0(%r2) -; vlvgg %v24, %r3, 1 +; vlvgg %v24, %r3, 0 ; br %r14 function %scalar_to_vector_f32x4(f32) -> f32x4 { @@ -1907,9 +1907,8 @@ block0(v0: f32): } ; block0: -; vrepf %v3, %v0, 0 -; vgbm %v5, 15 -; vn %v24, %v3, %v5 +; vgbm %v3, 61440 +; vn %v24, %v0, %v3 ; br %r14 function %scalar_to_vector_f32x4_lane_0(f32x4) -> f32x4 { @@ -1920,7 +1919,7 @@ block0(v0: f32x4): } ; block0: -; vgbm %v3, 15 +; vgbm %v3, 61440 ; vn %v24, %v24, %v3 ; br %r14 @@ -1932,8 +1931,8 @@ block0(v0: f32x4): } ; block0: -; vrepf %v3, %v24, 0 -; vgbm %v5, 15 +; vrepf %v3, %v24, 3 +; vgbm %v5, 61440 ; vn %v24, %v3, %v5 ; br %r14 @@ -1946,7 +1945,7 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 -; vlef %v24, 0(%r2), 3 +; vlef %v24, 0(%r2), 0 ; br %r14 function %scalar_to_vector_f32x4_mem_little(i64) -> f32x4 { @@ -1959,6 +1958,6 @@ block0(v0: i64): ; block0: ; vgbm %v24, 0 ; lrv %r3, 0(%r2) -; vlvgf %v24, %r3, 3 +; vlvgf %v24, %r3, 0 ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-logical.clif b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif index b0375f81dc10..d692a14c4a57 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-logical.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-logical.clif @@ -625,7 +625,55 @@ block0(v0: f64x2, v1: f64x2): ; lochio %r2, 1 ; br %r14 -function %vhigh_bits(i64x2) -> i64 { +function %vhigh_bits_be(i64x2) -> i64 { +block0(v0: i64x2): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808080804000 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits_be(i32x4) -> i64 { +block0(v0: i32x4): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080808080808060402000 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits_be(i16x8) -> i64 { +block0(v0: i16x8): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x80808080808080807060504030201000 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits_be(i8x16) -> i64 { +block0(v0: i8x16): + v1 = vhigh_bits.i64 v0 + return v1 +} + +; block0: +; bras %r1, 20 ; data.u128 0x78706860585048403830282018100800 ; vl %v3, 0(%r1) +; vbperm %v5, %v24, %v3 +; lgdr %r2, %f5 +; br %r14 + +function %vhigh_bits_le(i64x2) -> i64 wasmtime_system_v { block0(v0: i64x2): v1 = vhigh_bits.i64 v0 return v1 @@ -637,7 +685,7 @@ block0(v0: i64x2): ; lgdr %r2, %f5 ; br %r14 -function %vhigh_bits(i32x4) -> i64 { +function %vhigh_bits_le(i32x4) -> i64 wasmtime_system_v { block0(v0: i32x4): v1 = vhigh_bits.i64 v0 return v1 @@ -649,7 +697,7 @@ block0(v0: i32x4): ; lgdr %r2, %f5 ; br %r14 -function %vhigh_bits(i16x8) -> i64 { +function %vhigh_bits_le(i16x8) -> i64 wasmtime_system_v { block0(v0: i16x8): v1 = vhigh_bits.i64 v0 return v1 @@ -661,7 +709,7 @@ block0(v0: i16x8): ; lgdr %r2, %f5 ; br %r14 -function %vhigh_bits(i8x16) -> i64 { +function %vhigh_bits_le(i8x16) -> i64 wasmtime_system_v { block0(v0: i8x16): v1 = vhigh_bits.i64 v0 return v1 diff --git a/cranelift/filetests/filetests/isa/s390x/vec-permute-le-lane.clif b/cranelift/filetests/filetests/isa/s390x/vec-permute-le-lane.clif new file mode 100644 index 000000000000..1c2ef9515169 --- 
/dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-permute-le-lane.clif @@ -0,0 +1,493 @@ +test compile precise-output +target s390x + +function %swizzle(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = swizzle.i8x16 v0, v1 + return v2 +} + +; block0: +; vgbm %v5, 0 +; vrepib %v7, 239 +; vno %v17, %v25, %v25 +; vmxlb %v19, %v7, %v17 +; vperm %v24, %v5, %v24, %v19 +; br %r14 + +function %shuffle_0(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + return v2 +} + +; block0: +; vrepib %v5, 15 +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_1(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; block0: +; bras %r1, 20 ; data.u128 0x0a1e000d0b1702180403090b15100f0c ; vl %v5, 0(%r1) +; vperm %v24, %v24, %v25, %v5 +; br %r14 + +function %shuffle_2(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47] + return v2 +} + +; block0: +; vgbm %v5, 1 +; bras %r1, 20 ; data.u128 0x8080808080808080808080808080800f ; vl %v7, 0(%r1) +; vperm %v17, %v24, %v25, %v7 +; vn %v24, %v5, %v17 +; br %r14 + +function %shuffle_vmrhg_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhf_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhh_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhb_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrhg_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhf_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhh_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhb_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrhg_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15] + return v2 +} + +; block0: +; vmrhg %v24, %v24, %v24 +; br 
%r14 + +function %shuffle_vmrhf_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15] + return v2 +} + +; block0: +; vmrhf %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhh_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15] + return v2 +} + +; block0: +; vmrhh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhb_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15] + return v2 +} + +; block0: +; vmrhb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrhg_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31] + return v2 +} + +; block0: +; vmrhg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhf_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31] + return v2 +} + +; block0: +; vmrhf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhh_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31] + return v2 +} + +; block0: +; vmrhh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrhb_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31] + return v2 +} + +; block0: +; vmrhb %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlg_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlf_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlh_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlb_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v25 +; br %r14 + +function %shuffle_vmrlg_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlf_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlh_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlb_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 16 1 17 2 
18 3 19 4 20 5 21 6 22 7 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v24 +; br %r14 + +function %shuffle_vmrlg_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + return v2 +} + +; block0: +; vmrlg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlf_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7] + return v2 +} + +; block0: +; vmrlf %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlh_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7] + return v2 +} + +; block0: +; vmrlh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlb_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] + return v2 +} + +; block0: +; vmrlb %v24, %v24, %v24 +; br %r14 + +function %shuffle_vmrlg_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23] + return v2 +} + +; block0: +; vmrlg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlf_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23] + return v2 +} + +; block0: +; vmrlf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlh_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23] + return v2 +} + +; block0: +; vmrlh %v24, %v25, %v25 +; br %r14 + +function %shuffle_vmrlb_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23] + return v2 +} + +; block0: +; vmrlb %v24, %v25, %v25 +; br %r14 + +;; Special patterns that can be implemented via PACK. 
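
The byte indices used in this little-endian lane order file are the mirror images of those in the big-endian vec-permute.clif tests further below, yet both files expect the same machine instructions. What follows is a minimal Rust sketch of the index translation implied by these expectations; the names LaneOrder, machine_lane and machine_shuffle_mask are illustrative stand-ins, not the backend's actual helpers.

/// s390x numbers vector elements big-endian: element 0 is the leftmost.
enum LaneOrder {
    BigEndian,
    LittleEndian,
}

/// Map a CLIF lane index to the machine element number.
fn machine_lane(order: LaneOrder, lane_count: u8, lane: u8) -> u8 {
    match order {
        LaneOrder::BigEndian => lane,
        LaneOrder::LittleEndian => lane_count - 1 - lane,
    }
}

/// Translate a CLIF shuffle byte mask into the machine-level permute
/// pattern.  Indices 0..15 select the first operand, 16..31 the second.
fn machine_shuffle_mask(order: LaneOrder, mask: &[u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for k in 0..16 {
        out[k] = match order {
            LaneOrder::BigEndian => mask[k],
            LaneOrder::LittleEndian => {
                // Machine result byte k is CLIF result byte 15 - k; mirror
                // the selected byte within whichever operand it names.
                let m = mask[15 - k];
                if m < 16 { 15 - m } else { 47 - m }
            }
        };
    }
    out
}

fn main() {
    // extractlane.i32x4 lane 3: element 3 with big-endian lane order,
    // element 0 with little-endian lane order (vstef ..., 3 vs. ..., 0).
    assert_eq!(machine_lane(LaneOrder::BigEndian, 4, 3), 3);
    assert_eq!(machine_lane(LaneOrder::LittleEndian, 4, 3), 0);

    // The shuffle_vmrhg_xy mask from this file, translated under
    // little-endian lane order, becomes the same machine pattern as the
    // big-endian test's [0..7, 16..23]; both lower to vmrhg %v24, %v24, %v25.
    let le: [u8; 16] = [24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15];
    let be: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23];
    assert_eq!(machine_shuffle_mask(LaneOrder::LittleEndian, &le), be);
    println!("lane order translation checks passed");
}

The PACK special cases follow below.
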
+function %shuffle_vpkg_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkf_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkh_xy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v25 +; br %r14 + +function %shuffle_vpkg_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkf_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkh_yx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v24 +; br %r14 + +function %shuffle_vpkg_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11] + return v2 +} + +; block0: +; vpkg %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkf_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 0 1 4 5 8 9 12 13] + return v2 +} + +; block0: +; vpkf %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkh_xx(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 0 2 4 6 8 10 12 14] + return v2 +} + +; block0: +; vpkh %v24, %v24, %v24 +; br %r14 + +function %shuffle_vpkg_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 16 17 18 19 24 25 26 27] + return v2 +} + +; block0: +; vpkg %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkf_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 16 17 20 21 24 25 28 29] + return v2 +} + +; block0: +; vpkf %v24, %v25, %v25 +; br %r14 + +function %shuffle_vpkh_yy(i8x16, i8x16) -> i8x16 wasmtime_system_v { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 16 18 20 22 24 26 28 30] + return v2 +} + +; block0: +; vpkh %v24, %v25, %v25 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-permute.clif b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif index 4e5f7019c5a4..07dac2a9d027 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-permute.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-permute.clif @@ -9,10 +9,9 @@ block0(v0: i8x16, v1: i8x16): ; block0: ; vgbm %v5, 0 -; vrepib %v7, 239 -; vno %v17, %v25, %v25 -; vmxlb %v19, %v7, %v17 -; vperm %v24, %v5, %v24, %v19 +; vrepib %v7, 16 +; vmnlb %v17, %v7, %v25 +; vperm %v24, %v24, %v5, %v17 ; br %r14 function %shuffle_0(i8x16, i8x16) -> i8x16 { @@ -22,7 +21,7 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vrepib %v5, 
15 +; vgbm %v5, 0 ; vperm %v24, %v24, %v25, %v5 ; br %r14 @@ -33,7 +32,7 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; bras %r1, 20 ; data.u128 0x0a1e000d0b1702180403090b15100f0c ; vl %v5, 0(%r1) +; bras %r1, 20 ; data.u128 0x03001f1a04060c0b170d1804020f1105 ; vl %v5, 0(%r1) ; vperm %v24, %v24, %v25, %v5 ; br %r14 @@ -44,15 +43,15 @@ block0(v0: i8x16, v1: i8x16): } ; block0: -; vgbm %v5, 1 -; bras %r1, 20 ; data.u128 0x8080808080808080808080808080800f ; vl %v7, 0(%r1) +; vgbm %v5, 32768 +; bras %r1, 20 ; data.u128 0x00808080808080808080808080808080 ; vl %v7, 0(%r1) ; vperm %v17, %v24, %v25, %v7 ; vn %v24, %v5, %v17 ; br %r14 function %shuffle_vmrhg_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] return v2 } @@ -62,7 +61,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhf_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] return v2 } @@ -72,7 +71,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhh_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] return v2 } @@ -82,7 +81,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhb_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15] + v2 = shuffle.i8x16 v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] return v2 } @@ -92,7 +91,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhg_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7] return v2 } @@ -102,7 +101,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhf_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7] return v2 } @@ -112,7 +111,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhh_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7] return v2 } @@ -122,7 +121,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhb_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + v2 = shuffle.i8x16 v0, v1, [16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7] return v2 } @@ -132,7 +131,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhg_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] return v2 } @@ -142,7 +141,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhf_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7] return v2 } @@ -152,7 +151,7 @@ block0(v0: i8x16, v1: i8x16): function 
%shuffle_vmrhh_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15] + v2 = shuffle.i8x16 v0, v1, [0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7] return v2 } @@ -162,7 +161,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhb_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15] + v2 = shuffle.i8x16 v0, v1, [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] return v2 } @@ -172,7 +171,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhg_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23] return v2 } @@ -182,7 +181,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhf_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23] return v2 } @@ -192,7 +191,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhh_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31] + v2 = shuffle.i8x16 v0, v1, [16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23] return v2 } @@ -202,7 +201,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrhb_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31] + v2 = shuffle.i8x16 v0, v1, [16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23] return v2 } @@ -212,7 +211,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlg_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 0 1 2 3 4 5 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] return v2 } @@ -222,7 +221,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlf_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 0 1 2 3 20 21 22 23 4 5 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] return v2 } @@ -232,7 +231,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlh_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 0 1 18 19 2 3 20 21 4 5 22 23 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] return v2 } @@ -242,7 +241,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlb_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 0 17 1 18 2 19 3 20 4 21 5 22 6 23 7] + v2 = shuffle.i8x16 v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] return v2 } @@ -252,7 +251,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlg_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 8 9 10 11 12 13 14 15] return v2 } @@ -262,7 +261,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlf_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 8 9 10 11 28 29 30 31 12 13 14 15] return v2 } @@ -272,7 +271,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlh_yx(i8x16, 
i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 8 9 26 27 10 11 28 29 12 13 30 31 14 15] return v2 } @@ -282,7 +281,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlb_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + v2 = shuffle.i8x16 v0, v1, [24 8 25 9 26 10 27 11 28 12 29 13 30 14 31 15] return v2 } @@ -292,7 +291,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlg_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15] return v2 } @@ -302,7 +301,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlf_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 0 1 2 3 4 5 6 7 4 5 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 10 11 8 9 10 11 12 13 14 15 12 13 14 15] return v2 } @@ -312,7 +311,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlh_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 0 1 2 3 2 3 4 5 4 5 6 7 6 7] + v2 = shuffle.i8x16 v0, v1, [8 9 8 9 10 11 10 11 12 13 12 13 14 15 14 15] return v2 } @@ -322,7 +321,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlb_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] + v2 = shuffle.i8x16 v0, v1, [8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15] return v2 } @@ -332,7 +331,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlg_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 20 21 22 23 16 17 18 19 20 21 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 28 29 30 31 24 25 26 27 28 29 30 31] return v2 } @@ -342,7 +341,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlf_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 16 17 18 19 20 21 22 23 20 21 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 26 27 24 25 26 27 28 29 30 31 28 29 30 31] return v2 } @@ -352,7 +351,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlh_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 16 17 18 19 18 19 20 21 20 21 22 23 22 23] + v2 = shuffle.i8x16 v0, v1, [24 25 24 25 26 27 26 27 28 29 28 29 30 31 30 31] return v2 } @@ -362,7 +361,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vmrlb_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 16 17 17 18 18 19 19 20 20 21 21 22 22 23 23] + v2 = shuffle.i8x16 v0, v1, [24 24 25 25 26 26 27 27 28 28 29 29 30 30 31 31] return v2 } @@ -373,7 +372,7 @@ block0(v0: i8x16, v1: i8x16): ;; Special patterns that can be implemented via PACK. 
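
VECTOR PACK keeps the low-order half of every element of its two concatenated operands, so a shuffle whose byte mask selects exactly those halves (after the lane-order translation sketched earlier) can be lowered to a single vpkh/vpkf/vpkg. A small sketch of that byte pattern is given here; pack_pattern is an illustrative helper, not the backend's own code.

/// Byte indices into the 32-byte concatenation op0 ++ op1 (big-endian byte
/// numbering, as the machine sees it) that VECTOR PACK selects: the
/// low-order half of each source element.  elem_size is the source element
/// size in bytes: 2 -> vpkh, 4 -> vpkf, 8 -> vpkg.
fn pack_pattern(elem_size: usize) -> [u8; 16] {
    let half = elem_size / 2;
    let mut idx = [0u8; 16];
    for k in 0..16 {
        let elem = k / half; // source element this result byte comes from
        let off = k % half;  // position inside that element's low half
        idx[k] = (elem * elem_size + half + off) as u8;
    }
    idx
}

fn main() {
    // Matches the shuffle_vpkg_xy mask below, which lowers to
    // vpkg %v24, %v24, %v25.  The yx/xx/yy tests use the corresponding
    // masks with the operand roles swapped or duplicated and pass the
    // registers accordingly (e.g. vpkg %v24, %v25, %v24).
    let vpkg_xy: [u8; 16] = [4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31];
    assert_eq!(pack_pattern(8), vpkg_xy);
    println!("vpkg pattern check passed");
}
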
function %shuffle_vpkg_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 0 1 2 3 8 9 10 11] + v2 = shuffle.i8x16 v0, v1, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31] return v2 } @@ -383,7 +382,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkf_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 0 1 4 5 8 9 12 13] + v2 = shuffle.i8x16 v0, v1, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31] return v2 } @@ -393,7 +392,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkh_xy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 0 2 4 6 8 10 12 14] + v2 = shuffle.i8x16 v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31] return v2 } @@ -403,7 +402,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkg_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + v2 = shuffle.i8x16 v0, v1, [20 21 22 23 28 29 30 31 4 5 6 7 12 13 14 15] return v2 } @@ -413,7 +412,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkf_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + v2 = shuffle.i8x16 v0, v1, [18 19 22 23 26 27 30 31 2 3 6 7 10 11 14 15] return v2 } @@ -423,7 +422,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkh_yx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + v2 = shuffle.i8x16 v0, v1, [17 19 21 23 25 27 29 31 1 3 5 7 9 11 13 15] return v2 } @@ -433,7 +432,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkg_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11] + v2 = shuffle.i8x16 v0, v1, [4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15] return v2 } @@ -443,7 +442,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkf_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 1 4 5 8 9 12 13 0 1 4 5 8 9 12 13] + v2 = shuffle.i8x16 v0, v1, [2 3 6 7 10 11 14 15 2 3 6 7 10 11 14 15] return v2 } @@ -453,7 +452,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkh_xx(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [0 2 4 6 8 10 12 14 0 2 4 6 8 10 12 14] + v2 = shuffle.i8x16 v0, v1, [1 3 5 7 9 11 13 15 1 3 5 7 9 11 13 15] return v2 } @@ -463,7 +462,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkg_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 18 19 24 25 26 27 16 17 18 19 24 25 26 27] + v2 = shuffle.i8x16 v0, v1, [20 21 22 23 28 29 30 31 20 21 22 23 28 29 30 31] return v2 } @@ -473,7 +472,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkf_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 17 20 21 24 25 28 29 16 17 20 21 24 25 28 29] + v2 = shuffle.i8x16 v0, v1, [18 19 22 23 26 27 30 31 18 19 22 23 26 27 30 31] return v2 } @@ -483,7 +482,7 @@ block0(v0: i8x16, v1: i8x16): function %shuffle_vpkh_yy(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): - v2 = shuffle.i8x16 v0, v1, [16 18 20 22 24 26 28 30 16 18 20 22 24 26 28 30] + v2 = shuffle.i8x16 v0, v1, [17 19 21 23 25 27 29 31 17 19 21 23 25 27 29 31] return v2 } diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif 
b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif index 398eab69de6e..44d167713f37 100644 --- a/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif +++ b/cranelift/filetests/filetests/isa/s390x/vecmem-arch13.clif @@ -216,7 +216,7 @@ block0(v0: i64): } ; block0: -; vlebrg %v3, 0(%r2), 0 +; ld %f3, 0(%r2) ; vuplhb %v24, %v3 ; br %r14 @@ -227,8 +227,9 @@ block0(v0: i64): } ; block0: -; vlebrg %v3, 0(%r2), 0 -; vuplhh %v24, %v3 +; ld %f3, 0(%r2) +; verllh %v5, %v3, 8 +; vuplhh %v24, %v5 ; br %r14 function %uload32x2_little(i64) -> i64x2 { @@ -239,7 +240,8 @@ block0(v0: i64): ; block0: ; vlebrg %v3, 0(%r2), 0 -; vuplhf %v24, %v3 +; verllg %v5, %v3, 32 +; vuplhf %v24, %v5 ; br %r14 function %sload8x8_little(i64) -> i16x8 { @@ -249,7 +251,7 @@ block0(v0: i64): } ; block0: -; vlebrg %v3, 0(%r2), 0 +; ld %f3, 0(%r2) ; vuphb %v24, %v3 ; br %r14 @@ -260,8 +262,9 @@ block0(v0: i64): } ; block0: -; vlebrg %v3, 0(%r2), 0 -; vuphh %v24, %v3 +; ld %f3, 0(%r2) +; verllh %v5, %v3, 8 +; vuphh %v24, %v5 ; br %r14 function %sload32x2_little(i64) -> i64x2 { @@ -272,7 +275,8 @@ block0(v0: i64): ; block0: ; vlebrg %v3, 0(%r2), 0 -; vuphf %v24, %v3 +; verllg %v5, %v3, 32 +; vuphf %v24, %v5 ; br %r14 function %load_i8x16_little(i64) -> i8x16 { @@ -282,7 +286,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vl %v24, 0(%r2) ; br %r14 function %load_i16x8_little(i64) -> i16x8 { @@ -292,7 +296,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vlbrh %v24, 0(%r2) ; br %r14 function %load_i32x4_little(i64) -> i32x4 { @@ -302,7 +306,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vlbrf %v24, 0(%r2) ; br %r14 function %load_i64x2_little(i64) -> i64x2 { @@ -312,7 +316,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vlbrg %v24, 0(%r2) ; br %r14 function %load_i128_little(i64) -> i128 { @@ -333,7 +337,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vlbrf %v24, 0(%r2) ; br %r14 function %load_f64x2_little(i64) -> f64x2 { @@ -343,7 +347,7 @@ block0(v0: i64): } ; block0: -; vlbrq %v24, 0(%r2) +; vlbrg %v24, 0(%r2) ; br %r14 function %store_i8x16_little(i8x16, i64) { @@ -353,7 +357,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vst %v24, 0(%r2) ; br %r14 function %store_i16x8_little(i16x8, i64) { @@ -363,7 +367,7 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vstbrh %v24, 0(%r2) ; br %r14 function %store_i32x4_little(i32x4, i64) { @@ -373,7 +377,7 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vstbrf %v24, 0(%r2) ; br %r14 function %store_i64x2_little(i64x2, i64) { @@ -383,7 +387,7 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vstbrg %v24, 0(%r2) ; br %r14 function %store_i128_little(i128, i64) { @@ -404,7 +408,7 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vstbrf %v24, 0(%r2) ; br %r14 function %store_f64x2_little(f64x2, i64) { @@ -414,6 +418,6 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vstbrq %v24, 0(%r2) +; vstbrg %v24, 0(%r2) ; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane-arch13.clif b/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane-arch13.clif new file mode 100644 index 000000000000..d2f56f9cc02c --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane-arch13.clif @@ -0,0 +1,379 @@ +test compile precise-output +target s390x arch13 + +function %uload8x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; 
vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; verllh %v5, %v3, 8 +; vuplhh %v24, %v5 +; br %r14 + +function %uload32x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; verllg %v5, %v3, 32 +; vuplhf %v24, %v5 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; verllh %v5, %v3, 8 +; vuphh %v24, %v5 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; verllg %v5, %v3, 32 +; vuphf %v24, %v5 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vlerh %v24, 0(%r2) +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vlerf %v24, 0(%r2) +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vlerg %v24, 0(%r2) +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vlerf %v24, 0(%r2) +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vlerg %v24, 0(%r2) +; br %r14 + +function %store_i8x16_big(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vsterh %v24, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vsterf %v24, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vsterg %v24, 0(%r2) +; br %r14 + +function %store_f32x4_big(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vsterf %v24, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vsterg %v24, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhb %v24, %v3 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhh %v24, %v3 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = 
uload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuplhf %v24, %v3 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphb %v24, %v3 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphh %v24, %v3 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; vlebrg %v3, 0(%r2), 0 +; vuphf %v24, %v3 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; vlbrq %v24, 0(%r2) +; br %r14 + +function %store_i8x16_little(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_i64x2_little(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vstbrq %v24, 0(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane.clif b/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane.clif new file mode 100644 index 000000000000..9cc031eb1214 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vecmem-le-lane.clif @@ -0,0 +1,494 @@ +test compile precise-output +target s390x + +function %uload8x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = uload8x8 big v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhb %v24, %v5 +; br %r14 + +function %uload16x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = uload16x4 big v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; verllh %v7, %v5, 8 +; vuplhh %v24, %v7 +; br %r14 + +function 
%uload32x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = uload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; verllg %v5, %v3, 32 +; vuplhf %v24, %v5 +; br %r14 + +function %sload8x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = sload8x8 big v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphb %v24, %v5 +; br %r14 + +function %sload16x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = sload16x4 big v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; verllh %v7, %v5, 8 +; vuphh %v24, %v7 +; br %r14 + +function %sload32x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = sload32x2 big v0 + return v1 +} + +; block0: +; ld %f3, 0(%r2) +; verllg %v5, %v3, 32 +; vuphf %v24, %v5 +; br %r14 + +function %load_i8x16_big(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8x16 big v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i16x8_big(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16x8 big v0 + return v1 +} + +; block0: +; vl %v3, 0(%r2) +; vpdi %v5, %v3, %v3, 4 +; verllg %v7, %v5, 32 +; verllf %v24, %v7, 16 +; br %r14 + +function %load_i32x4_big(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32x4 big v0 + return v1 +} + +; block0: +; vl %v3, 0(%r2) +; vpdi %v5, %v3, %v3, 4 +; verllg %v24, %v5, 32 +; br %r14 + +function %load_i64x2_big(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64x2 big v0 + return v1 +} + +; block0: +; vl %v3, 0(%r2) +; vpdi %v24, %v3, %v3, 4 +; br %r14 + +function %load_f32x4_big(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32x4 big v0 + return v1 +} + +; block0: +; vl %v3, 0(%r2) +; vpdi %v5, %v3, %v3, 4 +; verllg %v24, %v5, 32 +; br %r14 + +function %load_f64x2_big(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64x2 big v0 + return v1 +} + +; block0: +; vl %v3, 0(%r2) +; vpdi %v24, %v3, %v3, 4 +; br %r14 + +function %store_i8x16_big(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + store.i8x16 big v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i16x8_big(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + store.i16x8 big v0, v1 + return +} + +; block0: +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; verllf %v16, %v6, 16 +; vst %v16, 0(%r2) +; br %r14 + +function %store_i32x4_big(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + store.i32x4 big v0, v1 + return +} + +; block0: +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; vst %v6, 0(%r2) +; br %r14 + +function %store_i64x2_big(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + store.i64x2 big v0, v1 + return +} + +; block0: +; vpdi %v4, %v24, %v24, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %store_f32x4_big(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + store.f32x4 big v0, v1 + return +} + +; block0: +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; vst %v6, 0(%r2) +; br %r14 + +function %store_f64x2_big(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + store.f64x2 big v0, v1 + return +} + +; block0: +; vpdi %v4, %v24, %v24, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %uload8x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = uload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhb %v24, 
%v5 +; br %r14 + +function %uload16x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = uload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhh %v24, %v5 +; br %r14 + +function %uload32x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = uload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuplhf %v24, %v5 +; br %r14 + +function %sload8x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = sload8x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphb %v24, %v5 +; br %r14 + +function %sload16x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = sload16x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphh %v24, %v5 +; br %r14 + +function %sload32x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = sload32x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; ldgr %f5, %r5 +; vuphf %v24, %v5 +; br %r14 + +function %load_i8x16_little(i64) -> i8x16 wasmtime_system_v { +block0(v0: i64): + v1 = load.i8x16 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i16x8_little(i64) -> i16x8 wasmtime_system_v { +block0(v0: i64): + v1 = load.i16x8 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i32x4_little(i64) -> i32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.i32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_i64x2_little(i64) -> i64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.i64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f32x4_little(i64) -> f32x4 wasmtime_system_v { +block0(v0: i64): + v1 = load.f32x4 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64x2 little v0 + return v1 +} + +; block0: +; lrvg %r5, 0(%r2) +; lrvg %r3, 8(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %load_f64x2_sum_little(i64, i64) -> f64x2 wasmtime_system_v { +block0(v0: i64, v1: i64): + v2 = iadd.i64 v0, v1 + v3 = load.f64x2 little v2 + return v3 +} + +; block0: +; lrvg %r4, 0(%r3,%r2) +; lrvg %r5, 8(%r3,%r2) +; vlvgp %v24, %r5, %r4 +; br %r14 + +function %load_f64x2_off_little(i64) -> f64x2 wasmtime_system_v { +block0(v0: i64): + v1 = load.f64x2 little v0+128 + return v1 +} + +; block0: +; lrvg %r5, 128(%r2) +; lrvg %r3, 136(%r2) +; vlvgp %v24, %r3, %r5 +; br %r14 + +function %store_i8x16_little(i8x16, i64) wasmtime_system_v { +block0(v0: i8x16, v1: i64): + store.i8x16 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i16x8_little(i16x8, i64) wasmtime_system_v { +block0(v0: i16x8, v1: i64): + store.i16x8 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_i32x4_little(i32x4, i64) wasmtime_system_v { +block0(v0: i32x4, v1: i64): + store.i32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function 
%store_i64x2_little(i64x2, i64) wasmtime_system_v { +block0(v0: i64x2, v1: i64): + store.i64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f32x4_little(f32x4, i64) wasmtime_system_v { +block0(v0: f32x4, v1: i64): + store.f32x4 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_little(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 0(%r2) +; strvg %r4, 8(%r2) +; br %r14 + +function %store_f64x2_sum_little(f64x2, i64, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64, v2: i64): + v3 = iadd.i64 v1, v2 + store.f64x2 little v0, v3 + return +} + +; block0: +; vlgvg %r5, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r5, 0(%r3,%r2) +; strvg %r4, 8(%r3,%r2) +; br %r14 + +function %store_f64x2_off_little(f64x2, i64) wasmtime_system_v { +block0(v0: f64x2, v1: i64): + store.f64x2 little v0, v1+128 + return +} + +; block0: +; vlgvg %r3, %v24, 1 +; vlgvg %r4, %v24, 0 +; strvg %r3, 128(%r2) +; strvg %r4, 136(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vecmem.clif b/cranelift/filetests/filetests/isa/s390x/vecmem.clif index c37e6b60110e..19e2782cad0b 100644 --- a/cranelift/filetests/filetests/isa/s390x/vecmem.clif +++ b/cranelift/filetests/filetests/isa/s390x/vecmem.clif @@ -216,9 +216,8 @@ block0(v0: i64): } ; block0: -; lrvg %r5, 0(%r2) -; ldgr %f5, %r5 -; vuplhb %v24, %v5 +; ld %f3, 0(%r2) +; vuplhb %v24, %v3 ; br %r14 function %uload16x4_little(i64) -> i32x4 { @@ -228,8 +227,8 @@ block0(v0: i64): } ; block0: -; lrvg %r5, 0(%r2) -; ldgr %f5, %r5 +; ld %f3, 0(%r2) +; verllh %v5, %v3, 8 ; vuplhh %v24, %v5 ; br %r14 @@ -242,7 +241,8 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; ldgr %f5, %r5 -; vuplhf %v24, %v5 +; verllg %v7, %v5, 32 +; vuplhf %v24, %v7 ; br %r14 function %sload8x8_little(i64) -> i16x8 { @@ -252,9 +252,8 @@ block0(v0: i64): } ; block0: -; lrvg %r5, 0(%r2) -; ldgr %f5, %r5 -; vuphb %v24, %v5 +; ld %f3, 0(%r2) +; vuphb %v24, %v3 ; br %r14 function %sload16x4_little(i64) -> i32x4 { @@ -264,8 +263,8 @@ block0(v0: i64): } ; block0: -; lrvg %r5, 0(%r2) -; ldgr %f5, %r5 +; ld %f3, 0(%r2) +; verllh %v5, %v3, 8 ; vuphh %v24, %v5 ; br %r14 @@ -278,7 +277,8 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; ldgr %f5, %r5 -; vuphf %v24, %v5 +; verllg %v7, %v5, 32 +; vuphf %v24, %v7 ; br %r14 function %load_i8x16_little(i64) -> i8x16 { @@ -288,9 +288,7 @@ block0(v0: i64): } ; block0: -; lrvg %r5, 0(%r2) -; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vl %v24, 0(%r2) ; br %r14 function %load_i16x8_little(i64) -> i16x8 { @@ -302,7 +300,10 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v17, %v7, %v7, 4 +; verllg %v19, %v17, 32 +; verllf %v24, %v19, 16 ; br %r14 function %load_i32x4_little(i64) -> i32x4 { @@ -314,7 +315,9 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v17, %v7, %v7, 4 +; verllg %v24, %v17, 32 ; br %r14 function %load_i64x2_little(i64) -> i64x2 { @@ -326,7 +329,8 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v24, %v7, %v7, 4 ; br %r14 function %load_i128_little(i64) -> i128 { @@ -351,7 +355,9 @@ 
block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v17, %v7, %v7, 4 +; verllg %v24, %v17, 32 ; br %r14 function %load_f64x2_little(i64) -> f64x2 { @@ -363,7 +369,8 @@ block0(v0: i64): ; block0: ; lrvg %r5, 0(%r2) ; lrvg %r3, 8(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v24, %v7, %v7, 4 ; br %r14 function %load_f64x2_sum_little(i64, i64) -> f64x2 { @@ -376,7 +383,8 @@ block0(v0: i64, v1: i64): ; block0: ; lrvg %r4, 0(%r3,%r2) ; lrvg %r5, 8(%r3,%r2) -; vlvgp %v24, %r5, %r4 +; vlvgp %v17, %r5, %r4 +; vpdi %v24, %v17, %v17, 4 ; br %r14 function %load_f64x2_off_little(i64) -> f64x2 { @@ -388,7 +396,8 @@ block0(v0: i64): ; block0: ; lrvg %r5, 128(%r2) ; lrvg %r3, 136(%r2) -; vlvgp %v24, %r3, %r5 +; vlvgp %v7, %r3, %r5 +; vpdi %v24, %v7, %v7, 4 ; br %r14 function %store_i8x16_little(i8x16, i64) { @@ -398,10 +407,7 @@ block0(v0: i8x16, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 -; strvg %r3, 0(%r2) -; strvg %r4, 8(%r2) +; vst %v24, 0(%r2) ; br %r14 function %store_i16x8_little(i16x8, i64) { @@ -411,10 +417,13 @@ block0(v0: i16x8, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 -; strvg %r3, 0(%r2) -; strvg %r4, 8(%r2) +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; verllf %v16, %v6, 16 +; vlgvg %r4, %v16, 1 +; vlgvg %r3, %v16, 0 +; strvg %r4, 0(%r2) +; strvg %r3, 8(%r2) ; br %r14 function %store_i32x4_little(i32x4, i64) { @@ -424,8 +433,10 @@ block0(v0: i32x4, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; vlgvg %r3, %v6, 1 +; lgdr %r4, %f6 ; strvg %r3, 0(%r2) ; strvg %r4, 8(%r2) ; br %r14 @@ -437,10 +448,11 @@ block0(v0: i64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 -; strvg %r3, 0(%r2) -; strvg %r4, 8(%r2) +; vpdi %v4, %v24, %v24, 4 +; vlgvg %r4, %v4, 1 +; lgdr %r3, %f4 +; strvg %r4, 0(%r2) +; strvg %r3, 8(%r2) ; br %r14 function %store_i128_little(i128, i64) { @@ -464,8 +476,10 @@ block0(v0: f32x4, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 +; vpdi %v4, %v24, %v24, 4 +; verllg %v6, %v4, 32 +; vlgvg %r3, %v6, 1 +; lgdr %r4, %f6 ; strvg %r3, 0(%r2) ; strvg %r4, 8(%r2) ; br %r14 @@ -477,10 +491,11 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 -; strvg %r3, 0(%r2) -; strvg %r4, 8(%r2) +; vpdi %v4, %v24, %v24, 4 +; vlgvg %r4, %v4, 1 +; lgdr %r3, %f4 +; strvg %r4, 0(%r2) +; strvg %r3, 8(%r2) ; br %r14 function %store_f64x2_sum_little(f64x2, i64, i64) { @@ -491,8 +506,9 @@ block0(v0: f64x2, v1: i64, v2: i64): } ; block0: -; vlgvg %r5, %v24, 1 -; vlgvg %r4, %v24, 0 +; vpdi %v6, %v24, %v24, 4 +; vlgvg %r5, %v6, 1 +; lgdr %r4, %f6 ; strvg %r5, 0(%r3,%r2) ; strvg %r4, 8(%r3,%r2) ; br %r14 @@ -504,9 +520,10 @@ block0(v0: f64x2, v1: i64): } ; block0: -; vlgvg %r3, %v24, 1 -; vlgvg %r4, %v24, 0 -; strvg %r3, 128(%r2) -; strvg %r4, 136(%r2) +; vpdi %v4, %v24, %v24, 4 +; vlgvg %r4, %v4, 1 +; lgdr %r3, %f4 +; strvg %r4, 128(%r2) +; strvg %r3, 136(%r2) ; br %r14 diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif index 4021e89fee42..26c8911cf719 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif @@ -20,7 +20,8 @@ block0(v0: i64x2, v1: i64x2, v2: i32x4): v4 = bitselect v3, v0, v1 return v4 } -; run: %mask_casted([0 
0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0 0xF 0]) == [0xFF000E 0xFFFF40] +; N.B. The mask is chosen such that the result is correct with either LE or BE lane order. +; run: %mask_casted([0 0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0xFFF1 0xF 0xF]) == [0xFF000E 0xFFFF40] function %good_const_mask(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-swizzle.clif b/cranelift/filetests/filetests/runtests/simd-swizzle.clif index e1c7fba879da..2c53cfcee3e8 100644 --- a/cranelift/filetests/filetests/runtests/simd-swizzle.clif +++ b/cranelift/filetests/filetests/runtests/simd-swizzle.clif @@ -12,23 +12,3 @@ block0(v0: i8x16, v1: i8x16): } ; run: %swizzle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == [1 10 16 2 7 14 8 12 11 9 0 13 5 3 4 6] -function %swizzle_i16x8(i8x16, i8x16) -> i16x8 { -block0(v0: i8x16, v1: i8x16): - v2 = swizzle.i16x8 v0, v1 - return v2 -} -; run: %swizzle_i16x8([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == 0x060403050d00090b0c080e0702100a01 - -function %swizzle_i32x4(i8x16, i8x16) -> i32x4 { -block0(v0: i8x16, v1: i8x16): - v2 = swizzle.i32x4 v0, v1 - return v2 -} -; run: %swizzle_i32x4([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == 0x060403050d00090b0c080e0702100a01 - -function %swizzle_i64x2(i8x16, i8x16) -> i64x2 { -block0(v0: i8x16, v1: i8x16): - v2 = swizzle.i64x2 v0, v1 - return v2 -} -; run: %swizzle_i64x2([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == 0x060403050d00090b0c080e0702100a01 diff --git a/cranelift/filetests/src/function_runner.rs b/cranelift/filetests/src/function_runner.rs index ba1ca323e5fe..0502843b6b84 100644 --- a/cranelift/filetests/src/function_runner.rs +++ b/cranelift/filetests/src/function_runner.rs @@ -285,10 +285,16 @@ fn make_trampoline(signature: &ir::Signature, isa: &dyn TargetIsa) -> Function { // Calculate the type to load from memory, using integers for booleans (no encodings). let ty = param.value_type.coerce_bools_to_ints(); + // We always store vector types in little-endian byte order as DataValue. + let mut flags = ir::MemFlags::trusted(); + if param.value_type.is_vector() { + flags.set_endianness(ir::Endianness::Little); + } + // Load the value. let loaded = builder.ins().load( ty, - ir::MemFlags::trusted(), + flags, values_vec_ptr_val, (i * UnboxedValues::SLOT_SIZE) as i32, ); @@ -331,9 +337,14 @@ fn make_trampoline(signature: &ir::Signature, isa: &dyn TargetIsa) -> Function { } else { *value }; + // We always store vector types in little-endian byte order as DataValue. + let mut flags = ir::MemFlags::trusted(); + if param.value_type.is_vector() { + flags.set_endianness(ir::Endianness::Little); + } // Store the value. 
builder.ins().store( - ir::MemFlags::trusted(), + flags, value, values_vec_ptr_val, (i * UnboxedValues::SLOT_SIZE) as i32, @@ -400,11 +411,11 @@ mod test { block0(v0: i64, v1: i64): v2 = load.f32 notrap aligned v1 v3 = load.i8 notrap aligned v1+16 - v4 = load.i64x2 notrap aligned v1+32 + v4 = load.i64x2 notrap aligned little v1+32 v5 = load.i8 notrap aligned v1+48 v6 = icmp_imm ne v5, 0 v7, v8 = call_indirect sig0, v0(v2, v3, v4, v6) - store notrap aligned v7, v1 + store notrap aligned little v7, v1 v9 = bint.i64 v8 store notrap aligned v9, v1+16 return diff --git a/crates/cranelift/src/lib.rs b/crates/cranelift/src/lib.rs index a5bf431800eb..837c461bbbb5 100644 --- a/crates/cranelift/src/lib.rs +++ b/crates/cranelift/src/lib.rs @@ -8,7 +8,7 @@ use cranelift_codegen::ir; use cranelift_codegen::isa::{unwind::UnwindInfo, CallConv, TargetIsa}; use cranelift_entity::PrimaryMap; use cranelift_wasm::{DefinedFuncIndex, FuncIndex, WasmFuncType, WasmType}; -use target_lexicon::CallingConvention; +use target_lexicon::{Architecture, CallingConvention}; use wasmtime_environ::{ FilePos, FunctionInfo, InstructionAddressMap, ModuleTranslation, ModuleTypes, TrapInformation, }; @@ -190,6 +190,10 @@ fn func_signature( // about pointer authentication usage, so we can't just use // `CallConv::Fast`. CallConv::WasmtimeAppleAarch64 + } else if isa.triple().architecture == Architecture::S390x { + // On S390x we need a Wasmtime calling convention to ensure + // we're using little-endian vector lane order. + wasmtime_call_conv(isa) } else { CallConv::Fast }
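
Note (illustrative, not part of the patch): the trampoline change above relies on the existing MemFlags endianness mechanism to request little-endian byte order for vector loads and stores, so the s390x back-end can pick the correct lane-order handling itself. The sketch below shows that mechanism in isolation; the helper name load_le_vector is hypothetical and only mirrors what the function_runner.rs hunk does, under the assumption of the cranelift-codegen/cranelift-frontend APIs used in this patch.

    use cranelift_codegen::ir::{self, Value};
    use cranelift_frontend::FunctionBuilder;

    /// Hypothetical helper: load a value from memory, forcing little-endian
    /// byte order for vector types (mirrors the trampoline change above).
    fn load_le_vector(
        builder: &mut FunctionBuilder<'_>,
        ty: ir::Type,
        addr: Value,
        offset: i32,
    ) -> Value {
        // Start from fully trusted (notrap + aligned) flags, as the
        // trampoline does for its argument slots.
        let mut flags = ir::MemFlags::trusted();
        if ty.is_vector() {
            // Mark the access as little-endian; on s390x with a Wasmtime ABI
            // (little-endian lane order) this becomes a plain vector load,
            // otherwise the back-end inserts the required byte/lane swaps.
            flags.set_endianness(ir::Endianness::Little);
        }
        builder.ins().load(ty, flags, addr, offset)
    }

A store takes the same shape: build the flags the same way and call builder.ins().store(flags, value, addr, offset), as the second function_runner.rs hunk does.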